diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh index eb20415e515..b67b614c0cf 100755 --- a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -10,19 +10,17 @@ set -e -o pipefail stage=1 mic=ihm nj=30 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync with - # the same option given to prepare_lores_feats.sh. train_set=train_cleaned # you might set this to e.g. train_cleaned. -gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; - # it should contain alignments for 'train_set'. - +norvb_datadir=data/ihm/train_cleaned_sp num_threads_ubm=32 rvb_affix=_rvb nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stuff in, so it # becomes exp/$mic/nnet3_cleaned or whatever. num_data_reps=1 +sample_rate=16000 + +max_jobs_run=10 . ./cmd.sh . ./path.sh @@ -30,10 +28,7 @@ num_data_reps=1 nnet3_affix=${nnet3_affix}$rvb_affix -gmmdir=exp/${mic}/${gmm} - - -for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do +for f in data/${mic}/${train_set}/feats.scp; do if [ ! -f $f ]; then echo "$0: expected file $f to exist" exit 1 @@ -73,35 +68,22 @@ if [ $stage -le 1 ]; then for datadir in ${train_set}_sp dev eval; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/$mic/${datadir}_hires + --cmd "$train_cmd --max-jobs-run $max_jobs_run" data/$mic/${datadir}_hires steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires utils/fix_data_dir.sh data/$mic/${datadir}_hires done fi -if [ $stage -le 2 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${mic}/${train_set}_sp_hires $min_seg_len data/${mic}/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${mic}/${train_set}_sp_hires/cmvn.scp data/${mic}/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${mic}/${train_set}_sp_hires_comb/ -fi if [ $stage -le 3 ]; then echo "$0: creating reverberated MFCC features" - datadir=data/ihm/train_cleaned_sp - - mfccdir=${datadir}_rvb${num_data_reps}_hires/data + mfccdir=${norvb_datadir}_rvb${num_data_reps}_hires/data if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - if [ ! -f ${datadir}_rvb${num_data_reps}_hires/feats.scp ]; then + if [ ! -f ${norvb_datadir}_rvb${num_data_reps}_hires/feats.scp ]; then if [ ! 
-d "RIRS_NOISES" ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip @@ -123,60 +105,27 @@ if [ $stage -le 3 ]; then --isotropic-noise-addition-probability 1 \ --num-replications ${num_data_reps} \ --max-noises-per-minute 1 \ - --source-sampling-rate 16000 \ - ${datadir} ${datadir}_rvb${num_data_reps} + --source-sampling-rate $sample_rate \ + ${norvb_datadir} ${norvb_datadir}_rvb${num_data_reps} - utils/copy_data_dir.sh ${datadir}_rvb${num_data_reps} ${datadir}_rvb${num_data_reps}_hires - utils/data/perturb_data_dir_volume.sh ${datadir}_rvb${num_data_reps}_hires + utils/copy_data_dir.sh ${norvb_datadir}_rvb${num_data_reps} ${norvb_datadir}_rvb${num_data_reps}_hires + utils/data/perturb_data_dir_volume.sh ${norvb_datadir}_rvb${num_data_reps}_hires steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" ${datadir}_rvb${num_data_reps}_hires - steps/compute_cmvn_stats.sh ${datadir}_rvb${num_data_reps}_hires - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires - - utils/data/combine_short_segments.sh \ - ${datadir}_rvb${num_data_reps}_hires $min_seg_len ${datadir}_rvb${num_data_reps}_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp ${datadir}_rvb${num_data_reps}_hires/cmvn.scp ${datadir}_rvb${num_data_reps}_hires_comb/ - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires_comb/ + --cmd "$train_cmd --max-jobs-run $max_jobs_run" ${norvb_datadir}_rvb${num_data_reps}_hires + steps/compute_cmvn_stats.sh ${norvb_datadir}_rvb${num_data_reps}_hires + utils/fix_data_dir.sh ${norvb_datadir}_rvb${num_data_reps}_hires fi - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${datadir}_rvb${num_data_reps}_hires - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires_comb data/${mic}/${train_set}_sp_hires_comb ${datadir}_rvb${num_data_reps}_hires_comb + utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${norvb_datadir}_rvb${num_data_reps}_hires fi - if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/$mic/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${mic}/${train_set}/feats.scp \ - data/${mic}/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l data/$mic/t utils/fix_data_dir.sh data/$mic/train_ihmdata -rm $tmpdir/ihmutt2utt +#rm $tmpdir/ihmutt2utt exit 0; diff --git a/egs/ami/s5b/path.sh b/egs/ami/s5b/path.sh index ad2c93b309b..96eb8328ffc 100644 --- a/egs/ami/s5b/path.sh +++ b/egs/ami/s5b/path.sh @@ -11,3 +11,5 @@ BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt export PATH=$PATH:$LMBIN:$BEAMFORMIT:$SRILM +. 
/etc/profile.d/modules.sh +module load shared cuda80/toolkit diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index eacc69a6845..f7044826e6a 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -31,6 +31,7 @@ case $(hostname -d) in fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, + cm.gemini) AMI_DIR=/export/common/data/corpora/amicorpus;; # COE esac [ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 @@ -163,6 +164,8 @@ if [ $stage -le 10 ]; then local/run_cleanup_segmentation.sh --mic $mic fi +exit 0 + if [ $stage -le 11 ]; then ali_opt= [ "$mic" != "ihm" ] && ali_opt="--use-ihm-ali true" diff --git a/egs/ami/s5b/run_prepare_shared.sh b/egs/ami/s5b/run_prepare_shared.sh index 1dc0bf8f20a..f4bfa6ac188 100755 --- a/egs/ami/s5b/run_prepare_shared.sh +++ b/egs/ami/s5b/run_prepare_shared.sh @@ -8,6 +8,7 @@ case $(hostname -d) in fit.vutbr.cz) FISHER_TRANS=/mnt/matylda2/data/FISHER/fe_03_p1_tran ;; # BUT, clsp.jhu.edu) FISHER_TRANS=/export/corpora4/ami/fisher_trans/part1 ;; # JHU, cstr.ed.ac.uk) FISHER_TRANS=`pwd`/eddie_data/lm/data/fisher/part1 ;; # Edinburgh, + cm.gemini) FISHER_TRANS=/export/common/data/corpora/LDC/LDC2004T19_CLSP_format/fe_03_p1_tran/;; # COE *) echo "Please modify the script to add your loaction of the Fisher transcripts, or modify this script."; exit 1;; esac # Or select manually, diff --git a/egs/aspire/s5/conf/mfcc_hires.conf b/egs/aspire/s5/conf/mfcc_hires.conf index d870ab04c38..ee9f9efd92a 100755 --- a/egs/aspire/s5/conf/mfcc_hires.conf +++ b/egs/aspire/s5/conf/mfcc_hires.conf @@ -8,3 +8,4 @@ --num-ceps=40 # there is no dimensionality reduction. --low-freq=40 # low cutoff frequency for mel bins --high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--allow-downsample=true diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 63d3a7ca988..0d2b3700d39 100755 --- a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -20,6 +20,7 @@ test_stage=1 nj=70 tdnn_affix=_1a +chain_affix= hidden_dim=1024 cell_dim=1024 @@ -62,9 +63,9 @@ fi train_set=train_rvb gmm_dir=exp/tri5a # used to get training lattices (for chain supervision) -treedir=exp/chain/tree_bi_a -lat_dir=exp/chain/tri5a_${train_set}_lats # training lattices directory -dir=exp/chain/tdnn_lstm${tdnn_affix} +treedir=exp/chain${chain_affix}/tree_bi_a +lat_dir=exp/chain${chain_affix}/tri5a_${train_set}_lats # training lattices directory +dir=exp/chain${chain_affix}/tdnn_lstm${tdnn_affix} train_data_dir=data/${train_set}_hires train_ivector_dir=exp/nnet3/ivectors_${train_set} lang=data/lang_chain @@ -77,7 +78,7 @@ local/nnet3/run_ivector_common.sh --stage $stage --num-data-reps 3 || exit 1 mkdir -p $dir -norvb_lat_dir=exp/chain/tri5a_train_lats +norvb_lat_dir=exp/chain${chain_affix}/tri5a_train_lats if [ $stage -le 7 ]; then # Get the alignments as lattices (gives the chain training more freedom). @@ -257,10 +258,10 @@ if [ $stage -le 15 ]; then for d in dev_rvb test_rvb; do ( - if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + if [ ! 
-f exp/nnet3${nnet3_affix}/ivectors_${d}/ivector_online.scp ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${d}_hires exp/nnet3/extractor \ - exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + data/${d}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } fi decode_dir=$dir/decode_${d}_pp @@ -270,7 +271,7 @@ if [ $stage -le 15 ]; then --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --frames-per-chunk 160 \ - --online-ivector-dir exp/nnet3/ivectors_${d} \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${d} \ $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } ) & done @@ -292,7 +293,7 @@ if [ $stage -le 16 ]; then --extra-left-context-initial 0 --extra-right-context-final 0 \ --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ --pass2-decode-opts "--min-active 1000" \ - dev_aspire data/lang $dir/graph_pp $dir + dev_aspire_ldc data/lang $dir/graph_pp $dir fi if [ $stage -le 17 ]; then @@ -305,7 +306,7 @@ if [ $stage -le 17 ]; then --extra-left-context-initial 0 \ --max-count 75 \ --pass2-decode-opts "--min-active 1000" \ - dev_aspire data/lang $dir/graph_pp $dir + dev_aspire_ldc data/lang $dir/graph_pp $dir fi exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..0192e0ce1fe --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 + +tdnn_affix=1a +tree_affix=bi_a +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + done + + norvb_lat_dir=exp/chain${chain_affix}/tri5a_train_lats + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/JOB/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk '{print "rev"n"_"$1" "$2}' + done > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/JOB/copy_rvb_lattices.JOB.log \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:gzip -c > $lat_dir/lat.JOB.gz |" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. 
[note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/train $lang exp/tri5a $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
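+  # (For reference: --self-loop-scale 1.0 below is the value conventionally used when
+  # compiling decoding graphs for 'chain' models, matching the acoustic scale of 1.0
+  # used at decode time.  Also, with this script's default xent_regularize=0.025, the
+  # xent branch's learning-rate factor above works out to 0.5 / 0.025 = 20.)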
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 14 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage 1 --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1a.sh new file mode 100644 index 00000000000..590ae64fd97 --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1a.sh @@ -0,0 +1,369 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=50 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +tdnn_affix=_1a +chain_affix=_semisup_kl + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +lattice_lm_scale=0.5 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
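+  # (Illustration of the utterance-id mapping used by the lattice/weights copying stages
+  # above: with num_data_reps=3, an scp entry such as
+  #   utt0001 /some/path/lat_tmp.1.ark:12345
+  # is duplicated as
+  #   rev1_utt0001 /some/path/lat_tmp.1.ark:12345
+  #   rev2_utt0001 /some/path/lat_tmp.1.ark:12345
+  #   rev3_utt0001 /some/path/lat_tmp.1.ark:12345
+  # so each reverberated copy of an utterance reuses the lattice/weights of the original
+  # utterance.  The id 'utt0001' and the path are invented purely for this example.)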
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --iter "$decode_iter" \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp${decode_iter:+_iter$decode_iter} + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 ${decode_iter:+--iter $decode_iter} \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi + + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1b.sh 
b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1b.sh new file mode 100644 index 00000000000..3c5042df975 --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1b.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=50 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +tdnn_affix=_1a +chain_affix=_semisup_kl + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding 
the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
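+  # train_ts.py appears to be the teacher-student ('_ts') variant of
+  # steps/nnet3/chain/train.py used by these semi-supervised recipes; it is invoked below
+  # with the same style of options, plus the $egs_opts set above, which point
+  # --deriv-weights-scp at the per-frame weights derived from the seed model's best path.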
+ + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh new file mode 100755 index 00000000000..94ba563ae0f --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh @@ -0,0 +1,247 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe. + +# configs for 'chain' +affix=1a + +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=200 + +tdnn_affix=1a +tree_affix=bi_a +nnet3_affix=_norvb +chain_affix=_norvb + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
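+  # (7000 below is the target number of leaves for the new tree.)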
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev test; do + ( + if [ ! -f exp/nnet3${nnet3_affix}/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi +exit 0 + +#if [ $stage -le 16 ]; then +# local/nnet3/prep_test_aspire.sh --stage 1 --decode-num-jobs 30 --affix "$affix" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp $dir +#fi +#exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh new file mode 100755 index 00000000000..2994335f71b --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh @@ -0,0 +1,367 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=50 + +# seed model params +src_dir=exp/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +use_transcripts=false +tdnn_affix=_1a +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating 
neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
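+  # Note on the train.py options below: this is a teacher-student (semisup_ts)
+  # setup, so the supervision lattices in $lat_dir were copied from
+  # $norvb_lat_dir above and appear to already be at the chain output frame
+  # rate; that is why --chain.alignment-subsampling-factor is 1 and the
+  # left/right tolerances are 1, and why $egs_opts carries --deriv-weights-scp
+  # with per-frame best-path weights so that less reliable frames contribute
+  # less to the derivative.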
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
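+  # --self-loop-scale 1.0 is the usual setting when building graphs for 'chain'
+  # models; it pairs with the --acwt 1.0 --post-decode-acwt 10.0 used at decode
+  # time below.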
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/fisher_create_test_lang.sh b/egs/aspire/s5/local/fisher_create_test_lang.sh index 6739de822aa..27e0c8f081a 100755 --- a/egs/aspire/s5/local/fisher_create_test_lang.sh +++ b/egs/aspire/s5/local/fisher_create_test_lang.sh @@ -3,47 +3,54 @@ if [ -f path.sh ]; then . ./path.sh; fi -mkdir -p data/lang_test - arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz +large_lm=data/local/lm/4gram-mincount/lm_unpruned.gz +lang=data/lang +dir=data/lang_test + +. utils/parse_options.sh + +mkdir -p $dir + [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -cp -rT data/lang data/lang_test +cp -rT $lang $dir gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst + --read-symbol-table=$dir/words.txt - $dir/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic $dir/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head +fstprint --isymbols=$lang/phones.txt --osymbols=$lang/words.txt $lang/L.fst | head echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. +fstdeterminize $dir/G.fst /dev/null || echo Error determinizing G. # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize $dir/L_disambig.fst /dev/null || echo Error determinizing L. # Checking that disambiguated lexicon times G is determinizable # Note: we do this with fstdeterminizestar not fstdeterminize, as # fstdeterminize was taking forever (presumbaly relates to a bug # in this version of OpenFst that makes determinization slow for # some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ +fsttablecompose $dir/L_disambig.fst $dir/G.fst | \ fstdeterminizestar >/dev/null || echo Error # Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ +fsttablecompose $lang/L_disambig.fst $dir/G.fst | \ fstisstochastic || echo "[log:] LG is not stochastic" - +if [ ! 
-z "$large_lm" ]; then utils/build_const_arpa_lm.sh \ - data/local/lm/4gram-mincount/lm_unpruned.gz data/lang data/lang_test_fg + $large_lm $lang ${dir}_fg +fi echo "$0 succeeded" diff --git a/egs/aspire/s5/local/fisher_train_lms_pocolm.sh b/egs/aspire/s5/local/fisher_train_lms_pocolm.sh new file mode 100755 index 00000000000..15d2db6fb9d --- /dev/null +++ b/egs/aspire/s5/local/fisher_train_lms_pocolm.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Vimal Manohar +# Apache 2.0 +# +# It is based on the example scripts distributed with PocoLM + +set -e +stage=0 + +text=data/train_all/text +lexicon=data/local/dict/lexicon.txt +dir=data/local/pocolm + +num_ngrams_large=5000000 +num_ngrams_small=2500000 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +num_dev_sentences=10000 + +#bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from the output log of train_lm.py. +# These example metaparameter values are for a 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure that make_lm_dir.py is called and trains only the 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + cleantext=$dir/text_all.gz + + cut -d ' ' -f 2- $text | awk -v lex=$lexicon ' + BEGIN{ + while((getline<lex) >0) { seen[$1]=1; } + } + { + for(n=1; n<=NF;n++) { + if (seen[$n]) { + printf("%s ", $n); + } else { + printf("<unk> "); + } + } + printf("\n"); + }' | gzip -c > $cleantext || exit 1; + + # This is for reporting perplexities + gunzip -c $dir/text_all.gz | head -n $num_dev_sentences > \ + ${dir}/data/test.txt + + # use a subset of the annotated training data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + gunzip -c $dir/text_all.gz | tail -n +$[num_dev_sentences+1] | \ + head -n $num_dev_sentences > ${dir}/data/text/dev.txt + + gunzip -c $dir/text_all.gz | tail -n +$[2*num_dev_sentences+1] > \ + ${dir}/data/text/train.txt + + cat $lexicon | awk '{print $1}' | sort | uniq | awk ' + { + if ($1 == "<s>") { + print "<s> is in the vocabulary!" | "cat 1>&2" + exit 1; + } + if ($1 == "</s>") { + print "</s> is in the vocabulary!" 
| "cat 1>&2" + exit 1; + } + printf("%s\n", $1); + }' > $dir/data/wordlist || exit 1; +fi + +order=4 +wordlist=${dir}/data/wordlist + +lm_name="`basename ${wordlist}`_${order}" +min_counts='train=1' +if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" +fi + +unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=train ${bypass_metaparam_optim_opt} \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} | tee ${unpruned_lm_dir}/train_lm.log + + get_data_prob.py ${dir}/data/test.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_test.log +fi + +if [ $stage -le 2 ]; then + rm ${dir}/data/arpa/${order}gram_big.arpa.gz 2>/dev/null || true + echo "$0: pruning the LM (to larger size)" + # Using 5 million n-grams for a big LM for rescoring purposes. + prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_big/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_big/metaparameters ]; then + grep -q "can not do any pruning" ${dir}/data/lm_${order}_prune_big/prune_lm.log + if [ $? -eq 0 ]; then + echo "$0: LM could not be pruned. Something went wrong!" + exit 1 + fi + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz + echo "$0: No pruning necessary as num-ngrams is less than target" + exit 0 + fi + + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + rm ${dir}/data/arpa/${order}gram_small.arpa.gz 2>/dev/null || true + echo "$0: pruning the LM (to smaller size)" + # Using 3 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_small/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_small/metaparameters ]; then + grep -q "can not do any pruning" ${dir}/data/lm_${order}_prune_small/prune_lm.log + if [ $? -eq 0 ]; then + echo "$0: LM could not be pruned. Something went wrong!" 
+ exit 1 + fi + + ln -s ${order}gram_big.arpa.gz $dir/data/arpa/${order}gram_small.arpa.gz + exit 0 + fi + + + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh index f4366fef679..0af89e46105 100755 --- a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh +++ b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh @@ -31,7 +31,7 @@ fi data_set=$1 segmented_data_set=$2 -if [ "$data_set" == "dev_aspire" ]; then +if [[ "$data_set" =~ "dev_aspire" ]]; then if [ $stage -le 1 ]; then echo "$0: Creating the data dir with whole recordings without segmentation" # create a whole directory without the segments @@ -78,6 +78,5 @@ if [ $stage -le 3 ]; then data/${segmented_data_set}_hires/sub_segments data/${segmented_data_set}_hires steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires - utils/fix_data_dir.sh data/${segmented_data_set}_hires utils/validate_data_dir.sh --no-text data/${segmented_data_set}_hires fi diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh index 56b2de399f2..73a8cf8e718 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh @@ -12,6 +12,7 @@ stage=1 train_stage=-10 use_gpu=true +aspire_data= dir=exp/nnet2_multicondition/nnet_ms_a set -e @@ -51,7 +52,7 @@ else fi # do the common parts of the script. -local/multi_condition/run_nnet2_common.sh --stage $stage +local/multi_condition/run_nnet2_common.sh --stage $stage --aspire-data $aspire_data if [ $stage -le 7 ]; then diff --git a/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh b/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh new file mode 100755 index 00000000000..9f2fbff3205 --- /dev/null +++ b/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +# Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2016. Apache 2.0. +# This script generates the ctm files for dev_aspire, test_aspire and eval_aspire +# for scoring with ASpIRE scoring server. +# It also provides the WER for dev_aspire data. + +set -e + +# general opts +iter=final +stage=0 +decode_num_jobs=30 +affix= + +# segmentation opts +sad_affix= +sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" +sad_graph_opts= +sad_priors_opts= +sad_stage=0 +segment_only=false + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 + +# decode opts +decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=-1 +extra_right_context_final=-1 + +score_opts="--min-lmwt 1 --max-lmwt 20" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. 
utils/parse_options.sh || exit 1; + +if [ $# -ne 6 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 dev_aspire data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data_set=$1 #select from {dev_aspire, test_aspire, eval_aspire}* +sad_nnet_dir=$2 +sad_work_dir=$3 +lang=$4 # data/lang +graph=$5 #exp/tri5a/graph_pp +dir=$6 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_root_dir=exp/nnet3 +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [[ "$data_set" =~ "test_aspire" ]]; then + out_file=single_dev_test${affix}_$model_affix.ctm + act_data_set=test_aspire +elif [[ "$data_set" =~ "eval_aspire" ]]; then + out_file=single_eval${affix}_$model_affix.ctm + act_data_set=eval_aspire +elif [[ "$data_set" =~ "dev_aspire" ]]; then + # we will just decode the directory without oracle segments file + # as we would like to operate in the actual evaluation condition + out_file=single_dev${affix}_${model_affix}.ctm + act_data_set=dev_aspire +else + echo "$0: Unknown data-set $data_set" + exit 1 +fi + +if [ $stage -le 2 ]; then + steps/segmentation/detect_speech_activity.sh \ + --nj $decode_num_jobs --stage $sad_stage \ + --affix "$sad_affix" --graph-opts "$sad_graph_opts" \ + --transform-probs-opts "$sad_priors_opts" $sad_opts \ + data/$data_set $sad_nnet_dir mfcc_hires $sad_work_dir \ + $sad_work_dir/${data_set}${sad_affix:+_$sad_affix} || exit 1 +fi + +segmented_data_set=${data_set}${sad_affix:+_$sad_affix} + +if [ $stage -le 3 ]; then + if [ -f data/$act_data_set/ref.rttm ]; then + if [ ! -f $sad_work_dir/${segmented_data_set}_seg/reco2file_and_channel ]; then + awk '{print $2" "1}' $sad_work_dir/${segmented_data_set}_seg/segments | \ + sort -u > $sad_work_dir/${segmented_data_set}_seg/reco2file_and_channel + fi + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + --reco2file-and-channel=${sad_work_dir}/${segmented_data_set}_seg/reco2file_and_channel \ + ${sad_work_dir}/${segmented_data_set}_seg/{utt2spk,segments,sys.rttm} || exit 1 + + export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin + md-eval.pl -c 0.25 -r data/dev_aspire/ref.rttm \ + -s ${sad_work_dir}/${segmented_data_set}_seg/sys.rttm > \ + ${sad_work_dir}/${segmented_data_set}_seg/md_eval.log + fi +fi + +if [ $stage -le 4 ]; then + utils/copy_data_dir.sh $sad_work_dir/${segmented_data_set}_seg \ + data/${segmented_data_set}_hires + steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires + utils/fix_data_dir.sh data/${segmented_data_set}_hires +fi + +if $segment_only; then + echo "$0: --segment-only is true. Exiting." + exit 0 +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors" + # this does offline decoding. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. 
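+  # --max-count $max_count (75 by default, set at the top of this script) softly
+  # caps how much data is counted per i-vector, so speakers with a lot of audio
+  # stay closer to the prior; it is passed to extract_ivectors.sh below along
+  # with --sub-speaker-frames.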
+ steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $decode_num_jobs \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + data/${segmented_data_set}_hires $lang $ivector_root_dir/extractor \ + $ivector_root_dir/ivectors_${segmented_data_set} +fi + +decode_dir=$dir/decode_${segmented_data_set}${affix}_pp +if [ $stage -le 6 ]; then + echo "Generating lattices" + rm -f ${decode_dir}_tg/.error + steps/nnet3/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config \ + --acwt $acwt --post-decode-acwt $post_decode_acwt $decode_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring true --iter $iter --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set} \ + $graph data/${segmented_data_set}_hires ${decode_dir}_tg || \ + { echo "$0: Error decoding" && exit 1; } +fi + +if [ $stage -le 7 ]; then + echo "Rescoring lattices" + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + --skip-scoring true \ + ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \ + ${decode_dir}_{tg,fg}; +fi + +decode_dir=${decode_dir}_fg + +if [ $stage -le 8 ]; then + local/score_aspire.sh --cmd "$decode_cmd" \ + $score_opts \ + --word-ins-penalties "0.0,0.25,0.5,0.75,1.0" \ + --ctm-beam 6 \ + --iter $iter \ + --decode-mbr true \ + --tune-hyper true \ + $lang $decode_dir $act_data_set $segmented_data_set $out_file +fi diff --git a/egs/aspire/s5/local/nnet3/segment_and_decode.sh b/egs/aspire/s5/local/nnet3/segment_and_decode.sh index d66b72200c1..e8917d091e2 100755 --- a/egs/aspire/s5/local/nnet3/segment_and_decode.sh +++ b/egs/aspire/s5/local/nnet3/segment_and_decode.sh @@ -109,9 +109,9 @@ fi if [ $stage -le 4 ]; then utils/copy_data_dir.sh $sad_work_dir/${segmented_data_set}_seg \ - data/${segmented_data_set}_hires - steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires - utils/fix_data_dir.sh data/${segmented_data_set}_hires + data/${segmented_data_set}_seg_hires + steps/compute_cmvn_stats.sh data/${segmented_data_set}_seg_hires + utils/fix_data_dir.sh data/${segmented_data_set}_seg_hires fi if [ $stage -le 5 ]; then @@ -122,11 +122,11 @@ if [ $stage -le 5 ]; then # acoustic conditions drift over time within the speaker's data. 
steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $decode_num_jobs \ --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ - data/${segmented_data_set}_hires $lang $ivector_root_dir/extractor \ - $ivector_root_dir/ivectors_${segmented_data_set} + data/${segmented_data_set}_seg_hires $lang $ivector_root_dir/extractor \ + $ivector_root_dir/ivectors_${segmented_data_set}_seg fi -decode_dir=$dir/decode_${segmented_data_set}${affix}_pp +decode_dir=$dir/decode_${segmented_data_set}_seg${affix}_pp if [ $stage -le 6 ]; then echo "Generating lattices" rm -f ${decode_dir}_tg/.error @@ -138,8 +138,8 @@ if [ $stage -le 6 ]; then --extra-right-context-final $extra_right_context_final \ --frames-per-chunk "$frames_per_chunk" \ --skip-scoring true ${iter:+--iter $iter} --lattice-beam $lattice_beam \ - --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set} \ - $graph data/${segmented_data_set}_hires ${decode_dir}_tg || \ + --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set}_seg \ + $graph data/${segmented_data_set}_seg_hires ${decode_dir}_tg || \ { echo "$0: Error decoding" && exit 1; } fi @@ -147,7 +147,7 @@ if [ $stage -le 7 ]; then echo "Rescoring lattices" steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ --skip-scoring true \ - ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \ + ${lang}_pp_test{,_fg} data/${segmented_data_set}_seg_hires \ ${decode_dir}_{tg,fg}; fi @@ -161,5 +161,5 @@ if [ $stage -le 8 ]; then ${iter:+--iter $iter} \ --decode-mbr true \ --tune-hyper true \ - $lang $decode_dir $act_data_set $segmented_data_set $out_file + $lang $decode_dir $act_data_set ${segmented_data_set}_seg $out_file fi diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index de0a925a242..e5dbdf9fbec 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -214,7 +214,7 @@ if [ $stage -le 9 ]; then # the chain nnet # Increase sil-scale to predict more silence local/nnet3/prep_test_aspire_segmentation.sh --stage $test_stage \ - --decode-num-jobs $test_nj --affix "${test_affix}" \ + --decode-num-jobs $test_nj --sad-affix "${test_affix}" --affix "${test_affix}" \ --sad-opts "$sad_opts" \ --sad-graph-opts "--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" --sad-priors-opts "--sil-scale=0.1" \ --acwt 1.0 --post-decode-acwt 10.0 \ diff --git a/egs/aspire/s5/local/semisup/build_silprob.sh b/egs/aspire/s5/local/semisup/build_silprob.sh new file mode 100755 index 00000000000..c51e3ea05e3 --- /dev/null +++ b/egs/aspire/s5/local/semisup/build_silprob.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -e + +. ./cmd.sh +. 
./path.sh + +steps/get_prons.sh --cmd "$train_cmd" data/train_300k data/lang exp/semisup300k/tri5b + +utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict exp/semisup300k/tri5b/pron_counts_nowb.txt \ + exp/semisup300k/tri5b/sil_counts_nowb.txt \ + exp/semisup300k/tri5b/pron_bigram_counts_nowb.txt data/local/dict_300k_pp + +utils/prepare_lang.sh data/local/dict_300k_pp "" data/local/lang_300k_pp data/lang_300k_pp diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1a.sh new file mode 100755 index 00000000000..cd0abc6792a --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1a.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 +test_stage=1 +nj=70 + +train_set=train_300k +exp=exp/semisup300k +gmm=tri5a + +tdnn_affix=_1a +tree_affix=bi_a +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/uttlist + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb_hires \ + data/${rvb_train_set}_hires + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb \ + data/${rvb_train_set} +fi + +norvb_lat_dir=${exp}/chain${chain_affix}/${gmm}_train_lats + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 30 --cmd "$train_cmd" \ + --generate-ali-from-lats true data/$train_set \ + data/lang $gmm_dir $norvb_lat_dir || exit 1; + rm $norvb_lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + mkdir -p $lat_dir + + utils/split_data.sh data/${rvb_train_set} $nj + + for n in `seq $nj`; do + awk '{print $1}' data/${rvb_train_set}/split$nj/$n/utt2spk | \ + perl -ane 's/rev[1-3]_//g' > $lat_dir/uttlist.$n.$nj + done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy \ + "scp:utils/filter_scp.pl data/${rvb_train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] 
+ rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang ${exp}/${gmm} $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + +exit 0 + +if [ $stage -le 16 ]; then + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 17 ]; then +# #Online decoding example + + local/nnet3/prep_test_aspire_online.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --max-count 75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + + + + +exit 0; + diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1b.sh new file mode 100755 index 00000000000..896129cb941 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1b.sh @@ -0,0 +1,355 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 +test_stage=1 +nj=70 + +train_set=train_300k +exp=exp/semisup300k +gmm=tri5a + +tdnn_affix=_1b +tree_affix=bi_b +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/uttlist + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb_hires \ + data/${rvb_train_set}_hires + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb \ + data/${rvb_train_set} +fi + +if [ $stage -le 9 ]; then + utils/data/perturb_data_dir_speed_3way.sh data/${rvb_train_set}_hires \ + data/${rvb_train_set}_sp_hires + utils/data/perturb_data_dir_volume.sh data/${rvb_train_set}_sp_hires + + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires.conf \ + data/${rvb_train_set}_sp_hires + steps/compute_cmvn_stats.sh data/${rvb_train_set}_sp_hires +fi + +if [ $stage -le 10 ]; then + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} \ + data/${train_set}_sp + + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_sp + steps/compute_cmvn_stats.sh data/${train_set}_sp +fi + +norvb_lat_dir=${exp}/chain${chain_affix}/${gmm}_${train_set}_sp_lats + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 30 --cmd "$train_cmd" \ + --generate-ali-from-lats true data/${train_set}_sp \ + data/lang $gmm_dir $norvb_lat_dir || exit 1; + rm $norvb_lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + mkdir -p $lat_dir + + utils/split_data.sh data/${rvb_train_set}_sp_hires $nj + + for n in `seq $nj`; do + awk '{print $1}' data/${rvb_train_set}_sp_hires/split$nj/$n/utt2spk | \ + perl -ane 's/rev[1-3]_//g' > $lat_dir/uttlist.$n.$nj + done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | \ + perl -pe 's:(rev[1-3])_(sp0.9|sp1.1)-:\2-\1_:g' | sort -k1,1 > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy \ + "scp:utils/filter_scp.pl data/${rvb_train_set}_sp_hires/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 13 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. 
+ # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $norvb_lat_dir $treedir || exit 1 +fi + +if [ $stage -le 15 ]; then + if [ ! -f exp/nnet3/ivectors_${rvb_train_set}_sp/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${rvb_train_set}_sp_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${rvb_train_set}_sp || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi +fi + +if [ $stage -le 16 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 18 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 19 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + +exit 0 + +if [ $stage -le 19 ]; then + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 17 ]; then +# #Online decoding example + + local/nnet3/prep_test_aspire_online.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --max-count 75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + + + + +exit 0; + diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_1b.sh new file mode 100755 index 00000000000..da1c9962c92 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_1b.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_kl_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer 
name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 +lattice_lm_scale=0.5 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_ts.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "output-0=0,0 output-1=1,1" \ + --chain.mmi-factor-schedule "output-0=1,1 output-1=0,0" \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh new file mode 100755 index 00000000000..9436e56f0a2 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh @@ -0,0 +1,290 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1a +chain_affix=_kl_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
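+  # (Worked example, for illustration only: with xent_regularize=0.025 as set
+  #  in the config section above, learning_rate_factor = 0.5 / 0.025 = 20,
+  #  i.e. the xent output layer trains 20x faster than the chain output.)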
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh new file mode 100755 index 00000000000..37977e56ba7 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1b +chain_affix=_kl_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lattice_lm_scale=0.5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +echo "use the other script" +exit 1 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="--lattice-lm-scale $lattice_lm_scale --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
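+  # For 'chain' models the graph is built with --self-loop-scale 1.0; this
+  # pairs with the --acwt 1.0 --post-decode-acwt 10.0 settings used in the
+  # decode stage further below.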
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh new file mode 100755 index 00000000000..083b54a61e2 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh @@ -0,0 +1,420 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +supervised_data_dir=data/ami_sdm1_train_reco12 +unsupervised_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +supervision_weights=1.0,1.0 +num_copies=1,1 +lm_weights=1,1 + +tdnn_affix=_1a +chain_affix=_semisup_ts_ami_subset_sdm1 +nnet3_affix=_semisup_ts_ami_subset_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +lattice_lm_scale=0.5 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set}_sp + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true \ + $sup_student_data_dir $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_ts.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_sp_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $unsup_student_data_dir \ + --tree-dir $treedir \ + --lat-dir $unsup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh new file mode 100755 index 00000000000..0004a29f188 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1a +chain_affix=_kl_wt_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +primary_lr_factor=0.25 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev ami_sdm1_eval" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig +# input dim=100 name=ivector +# input dim=40 name=input +# +# # please note that it is important to have input layer with the name=input +# # as the layer immediately preceding the fixed-affine-layer to enable +# # the use of short notation for the descriptor +# fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat +# +# # the first splicing is moved before the lda layer, so no splicing here +# relu-batchnorm-layer name=tdnn1 dim=$hidden_dim +# relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim +# +# fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# +# ## adding the layers for chain branch +# output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 +# +# # adding the layers for xent 
branch +# # This block prints the configs for a separate output that will be +# # trained with a cross-entropy objective in the 'chain' models... this +# # has the effect of regularizing the hidden parts of the model. we use +# # 0.5 / args.xent_regularize as the learning rate factor- the factor of +# # 0.5 / args.xent_regularize is suitable as it means the xent +# # final-layer learns at a rate independent of the regularization +# # constant; and the 0.5 was tuned so as to make the relative progress +# # similar in the xent and regular final layers. +# output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +# +# output name=output-0 input=output.affine@$label_delay skip-in-init=true +# output name=output-1 input=output.affine@$label_delay skip-in-init=true +# +# output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +#EOF +# steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +#fi + +if [ $stage -le 12 ]; then + # Set the learning-rate-factor for all transferred layers but the last output + # layer to primary_lr_factor. + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir $teacher_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $teacher_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $(dirname $src_ivector_extractor)/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_norvb_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_norvb_1a.sh new file mode 100755 index 00000000000..527fe36bb37 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_norvb_1a.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 +test_stage=1 +nj=70 + +train_set=train_300k +exp=exp/semisup300k +gmm=tri5a + +tdnn_affix=_1a +tree_affix=bi_a +nnet3_affix=_norvb +chain_affix=_norvb + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/uttlist + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_sp_hires \ + data/${train_set}_sp_hires + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_sp \ + data/${train_set}_sp +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 30 --cmd "$train_cmd" \ + --generate-ali-from-lats true data/${train_set}_sp \ + data/lang $gmm_dir $lat_dir || exit 1; + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
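+  # 7000 below is the target number of tree leaves; --frame-subsampling-factor 3
+  # matches the chain model's reduced output frame rate (one output per 3 frames).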
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev test; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi + +exit 0 + +if [ $stage -le 16 ]; then + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 17 ]; then +# #Online decoding example + + local/nnet3/prep_test_aspire_online.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --max-count 75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + + + + +exit 0; + diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1a.sh new file mode 100755 index 00000000000..9bdd1c99c65 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1a.sh @@ -0,0 +1,370 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +tdnn_affix=_1a +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
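+  # Notes on the training call below (a sketch of the intent, as far as it can
+  # be inferred from this script): the supervision lattices in $lat_dir were
+  # produced by the seed chain model, so they are already at the subsampled
+  # output frame rate; that appears to be why --chain.alignment-subsampling-factor
+  # is 1 and the left/right tolerances are tight. The --deriv-weights-scp passed
+  # via $egs_opts (best-path frame weights from the seed model) down-weights
+  # frames where the seed model's hypothesis is less confident.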
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1b.sh new file mode 100755 index 00000000000..d2affd9372d --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1b.sh @@ -0,0 +1,460 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.5 \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1c.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1c.sh new file mode 100755 index 00000000000..1893fa7772f --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1c.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1c +chain_affix=_semisup_ts + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done + + ln -sf ../final.mdl $unsup_lat_dir/final.mdl +fi + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + 
--chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi + + exit 1 +fi + +if [ $stage -le 22 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 
2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1d.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1d.sh new file mode 100755 index 00000000000..ebe52330554 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1d.sh @@ -0,0 +1,484 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1c +chain_affix=_semisup_ts + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +kl_decode_graph_dir=$src_dir/graph${kl_decode_graph_affix} + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
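+  # This graph, built from the seed model in $src_dir with ${kl_decode_lang},
+  # is used in the next stage to decode the unsupervised set and produce the
+  # lattices used for the KL supervision term. --self-loop-scale 1.0 is the
+  # usual setting when building decoding graphs for 'chain' models.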
+ utils/mkgraph.sh --self-loop-scale 1.0 ${kl_decode_lang} $src_dir $kl_decode_graph_dir +fi + +if [ $stage -le 15 ]; then + steps/nnet3/decode_semisup.sh --sub-split $nj --nj $nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --write-compact true --word-determinize false \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $unsup_src_ivector_dir \ + --skip-scoring true \ + $kl_decode_graph_dir data/${norvb_unsupervised_set}_hires $norvb_unsup_kl_lat_dir || exit 1 +fi + +if [ $stage -le 16 ]; then + utt_prefixes= + for n in $(seq $num_data_reps); do + utt_prefixes="$utt_prefixes rev${n}_" + done + + local/semisup/copy_lat_dir.sh --write-compact true \ + --nj $nj --utt_prefixes "$utt_prefixes" \ + data/${unsupervised_set}_hires \ + ${norvb_unsup_kl_lat_dir} ${unsup_kl_lat_dir} + + ln -sf ../final.mdl $unsup_kl_lat_dir/final.mdl +fi + +if [ $stage -le 17 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 18 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This 
block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 19 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_kl_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 21 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 22 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 23 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 24 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi + + exit 1 +fi + +if [ $stage -le 25 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1a.sh new file mode 100755 index 00000000000..3bb40618983 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1a.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1a +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
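+  # As a concrete example: with the default xent_regularize=0.025 set at the
+  # top of this script, this learning-rate factor works out to 0.5 / 0.025 = 20.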
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1b.sh new file mode 100755 index 00000000000..97bf6950664 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1b.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _a, but uses one less group of +# TDNN + LSTM layer + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1b +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a 
separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp $egs_opts" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
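+  # (Concretely: mkgraph.sh takes the HMM topology and tree from the model in
+  # $dir, and only the lexicon/grammar FSTs from ${tgt_lang}.  To sanity-check
+  # the resulting graph one could run e.g. "fstinfo $graph_dir/HCLG.fst".)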
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1c.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1c.sh new file mode 100755 index 00000000000..164fa929052 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1c.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports using different lattices +# for KL training, usually generated using a unigram LM. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +kl_student_graph_affix=_pp +kl_student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1b +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +kl_fst_scale=0.5 +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
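+  # Note: output-delay (= label_delay, 5 by default in this script) makes the
+  # network emit each label a few frames late, so the recurrent layers get a
+  # little extra future context; this is the usual setup in these LSTM-based
+  # chain recipes.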
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --kl-latdir $kl_lat_dir --kl-fst-scale $kl_fst_scale" + +if [ $stage -le 15 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights \ + --cmd "$train_cmd" \ + $treedir $src_dir/best_path${student_graph_affix}_${tgt_dataset}_sp \ + $dir +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 16 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" --combine-queue-opt "--h-rt 00:59:00" --train-queue-opt "--h-rt 00:15:00" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1d.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1d.sh new file mode 100755 index 00000000000..e59e26c1fb6 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1d.sh @@ -0,0 +1,331 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script supports using different lattices +# for KL training, usually generated using a unigram LM. +# This script is similar to _c, but updates existing teacher model instead +# of training from scratch. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +kl_student_graph_affix=_pp +kl_student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1c +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +kl_fst_scale=0.5 +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
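+  # Note: delay=-3 on the fast-lstmp layers makes the recurrence look back 3
+  # frames rather than 1; this is the usual choice in chain recipes, where the
+  # output frame rate is subsampled by a factor of 3.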
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +primary_lr_factor=0.1 + +if [ $stage -le 14 ]; then + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --kl-latdir $kl_lat_dir --kl-fst-scale $kl_fst_scale" + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1e.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1e.sh new file mode 100755 index 00000000000..5a3b7a3f0e8 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1e.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _d, but uses phone LM graph to compute +# numerator posteriors for KL objective. This script does weights +# transfer to update the nnet to target domain. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1e +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
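+  # Note: the xent branch shares all hidden layers with the chain branch; only
+  # this extra output layer is added, and it is used only in training (decoding
+  # reads the 'output' branch), so it costs essentially nothing at test time.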
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +primary_lr_factor=0.1 + +if [ $stage -le 14 ]; then + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --graph-posterior-rspecifier scp:$graph_post_dir/numerator_post.scp" + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1f.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1f.sh new file mode 100755 index 00000000000..e29f20d3b46 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1f.sh @@ -0,0 +1,323 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script uses phone LM graph to compute numerator posteriors for +# KL objective. +# This script is same as _e, but trains neural network from scratch. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1f +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
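+  # Note: the chain 'output' above sets include-log-softmax=false because the
+  # chain objective (and chain decoding) work on unnormalized network outputs;
+  # normalization is handled by the denominator FST during training.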
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --graph-posterior-rspecifier scp:$graph_post_dir/numerator_post.scp" + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1g.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1g.sh new file mode 100755 index 00000000000..4f862f62a2d --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1g.sh @@ -0,0 +1,323 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script uses phone LM graph to compute numerator posteriors for +# KL objective. +# This script is same as _e, but trains neural network from scratch. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1g +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
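+  # Note: max-change=1.5 caps the per-minibatch parameter change of each of
+  # these output layers, which helps keep early training stable.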
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --graph-posterior-rspecifier scp:$graph_post_dir/numerator_post.scp" + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
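+    # (Note added for clarity: $dir/.error is created by any decode that failed
+    # in the loop above; check the log/ subdirectory of the corresponding
+    # decode directory for details before re-running this stage.)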
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1h.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1h.sh new file mode 100755 index 00000000000..0ab38f8b2f3 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1h.sh @@ -0,0 +1,307 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports generates numerator posteriors +# after splitting egs. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1h +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +kl_fst_scale=0.5 +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
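+  # (Clarifying note, not in the original script: output-delay=$label_delay
+  # with label_delay=5 set above means each label is predicted about 5 frames
+  # after the corresponding input, which gives the unidirectional LSTMs a few
+  # frames of effective future context.)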
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ $stage -le 15 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights \ + --cmd "$train_cmd" \ + $treedir $src_dir/best_path${student_graph_affix}_${tgt_dataset}_sp \ + $dir +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 16 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1i.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1i.sh new file mode 100755 index 00000000000..8f1ad853201 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1i.sh @@ -0,0 +1,420 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports generates numerator posteriors +# after splitting egs. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp +supervision_weights=1,1 +num_copies=1,1 + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,3 # src, tgt weight + +tdnn_affix=_1h +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_frames_per_eg=150 + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 +unsup_egs_opts="" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +frame_subsampling_factor=1 +if [ -f $src_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $src_dir/frame_subsampling_factor) || exit 1 +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_16kHz_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_egs_opts="$unsup_egs_opts --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${tgt_dataset}_sp + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + $student_data_dir $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G --h-rt 00:15:00" --combine-queue-opt "--h-rt 00:50:00" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $unsup_egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$dir/egs_comb" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + rm -f $dir/.error + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1j.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1j.sh new file mode 100755 index 00000000000..428105469fa --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1j.sh @@ -0,0 +1,421 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports generates numerator posteriors +# after splitting egs. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp +supervision_weights=1,1 +num_copies=1,1 + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,3 # src, tgt weight + +tdnn_affix=_1j +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_frames_per_eg=150 + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 +unsup_egs_opts="" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +frame_subsampling_factor=1 +if [ -f $src_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $src_dir/frame_subsampling_factor) || exit 1 +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_16kHz_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_egs_opts="$unsup_egs_opts --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${tgt_dataset}_sp + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + $student_data_dir $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" --train-queue-opt "--h-rt 00:15:00" \ + --combine-queue-opt "--h-rt 00:50:00" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $unsup_egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$dir/egs_comb" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang 
directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + rm -f $dir/.error + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh new file mode 100755 index 00000000000..131ea0c8ec9 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh @@ -0,0 +1,424 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +supervised_data_dir=data/ami_sdm1_train_reco12 +unsupervised_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +supervision_weights=1.0,1.0 +num_copies=1,1 +lm_weights=1,1 + +tdnn_affix=_1a +chain_affix=_semisup_ts_ami_subset_sdm1 +nnet3_affix=_semisup_ts_ami_subset_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +lattice_lm_scale=0.5 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set}_sp + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true \ + $sup_student_data_dir $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg= +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_sp_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $unsup_student_data_dir \ + --tree-dir $treedir \ + --lat-dir $unsup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$dset \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_1b.sh new file mode 100755 index 00000000000..ce2cf81516d --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_1b.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_semisup_wt + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=1 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done 
| sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +# if [ $stage -le 15 ]; then +# echo "$0: creating neural net configs using the xconfig parser"; +# +# num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') +# learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) +# +# lstm_opts="decay-time=40" +# +# mkdir -p $dir/configs +# cat < $dir/configs/network.xconfig +# input dim=100 name=ivector +# input dim=40 name=input +# +# # please note that it is important to have input layer with the name=input +# # as the layer immediately preceding the fixed-affine-layer to enable +# # the use of short notation for the descriptor +# fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat +# +# # the first splicing is moved before the lda layer, so no splicing here +# relu-batchnorm-layer name=tdnn1 dim=$hidden_dim +# relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim +# +# fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# +# ## adding the layers for chain branch +# output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 +# +# +# # adding the layers for xent branch +# # This block prints the configs for a separate output that will be +# # trained with a cross-entropy objective in the 'chain' models... this +# # has the effect of regularizing the hidden parts of the model. we use +# # 0.5 / args.xent_regularize as the learning rate factor- the factor of +# # 0.5 / args.xent_regularize is suitable as it means the xent +# # final-layer learns at a rate independent of the regularization +# # constant; and the 0.5 was tuned so as to make the relative progress +# # similar in the xent and regular final layers. 
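+# # (Note added for clarity: this config-generation stage is kept commented
+# # out because this weight-transfer setup does not rebuild the network;
+# # training below starts from $src_dir/final.mdl via --trainer.input-model,
+# # so the xconfig here is retained only for reference.)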
+# output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +# +# output name=output-0 input=output.affine@$label_delay skip-in-init=true +# output name=output-1 input=output.affine@$label_delay skip-in-init=true +# +# output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# EOF +# steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +# fi + +. $src_dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.5 \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -1 ]; then + train_stage=-1 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $src_dir/final.mdl \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
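+  # (Clarifying note, not in the original script: for 'chain' models the graph
+  # is built with --self-loop-scale 1.0, which pairs with the
+  # --acwt 1.0 --post-decode-acwt 10.0 options used at decode time below.)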
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_ami_1a.sh new file mode 100755 index 00000000000..8779918f3c0 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_ami_1a.sh @@ -0,0 +1,297 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1b +chain_affix=_semisup_wt_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.5,0.5" +mmi_factor_schedule="output=0.5,0.5" + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +primary_lr_factor=0.25 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev ami_sdm1_eval" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig +# input dim=100 name=ivector +# input dim=40 name=input +# +# # please note that it is important to have input layer with the name=input +# # as the layer immediately preceding the fixed-affine-layer to enable +# # the use of short notation for the descriptor +# fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat +# +# # the first splicing is moved before the lda layer, so no splicing here +# relu-batchnorm-layer name=tdnn1 dim=$hidden_dim +# relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim +# +# fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# +# ## adding the layers for chain branch +# output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 +# +# # adding the layers for xent branch +# # This block prints the configs for a separate output that will be +# # trained with a cross-entropy objective in the 'chain' models... this +# # has the effect of regularizing the hidden parts of the model. we use +# # 0.5 / args.xent_regularize as the learning rate factor- the factor of +# # 0.5 / args.xent_regularize is suitable as it means the xent +# # final-layer learns at a rate independent of the regularization +# # constant; and the 0.5 was tuned so as to make the relative progress +# # similar in the xent and regular final layers. +# output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +# +# output name=output-0 input=output.affine@$label_delay skip-in-init=true +# output name=output-1 input=output.affine@$label_delay skip-in-init=true +# +# output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +#EOF +# steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +#fi + +if [ $stage -le 12 ]; then + # Set the learning-rate-factor for all transferred layers but the last output + # layer to primary_lr_factor. 
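+  # (Clarifying note:) nnet3-copy applies the --edits rules in order: the first
+  # rule scales the learning rate of every component by $primary_lr_factor, and
+  # the second restores a factor of 1.0 for the output layers, so only the
+  # transferred hidden layers learn slowly. A hypothetical variant that freezes
+  # the transferred layers completely would look like (sketch only, not used here):
+  #   nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=0.0; \
+  #                       set-learning-rate-factor name=output* learning-rate-factor=1.0" \
+  #     $src_dir/final.mdl $dir/input.raw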
+ $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="--kl-fst-scale 0.5 --lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir $teacher_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.mmi-factor-schedule=$mmi_factor_schedule \ + --chain.kl-factor-schedule=$kl_factor_schedule \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $teacher_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $(dirname $src_ivector_extractor)/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh new file mode 100755 index 00000000000..94e6b1b0f96 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" 
"$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.5 \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/copy_lat_dir.sh b/egs/aspire/s5/local/semisup/copy_lat_dir.sh new file mode 100755 index 00000000000..6aefd24a0b8 --- /dev/null +++ b/egs/aspire/s5/local/semisup/copy_lat_dir.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +utt_prefixes= +max_jobs_run=30 +nj=100 +cmd=queue.pl +write_compact=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi + +data=$1 +src_dir=$2 +dir=$3 + +mkdir -p $dir + +num_jobs=$(cat $src_dir/num_jobs) + +rm -f $dir/lat_tmp.*.{ark,scp} 2>/dev/null + +# Copy the lattices temporarily +$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=$write_compact \ + "ark:gunzip -c $src_dir/lat.JOB.gz |" \ + ark,scp:$dir/lat_tmp.JOB.ark,$dir/lat_tmp.JOB.scp || exit 1 + +# Make copies of utterances for perturbed data +for p in $utt_prefixes; do + cat $dir/lat_tmp.*.scp | awk -v p=$p '{print p$0}' +done | sort -k1,1 > $dir/lat_out.scp + +utils/split_data.sh ${data} $nj + +# Copy and dump the lattices for perturbed data +$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_lattices.JOB.log \ + lattice-copy --write-compact=$write_compact \ + "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/lat_out.scp |" \ + "ark:| gzip -c > $dir/lat.JOB.gz" || exit 1 + +rm $dir/lat_tmp.* $dir/lat_out.scp + +echo $nj > $dir/num_jobs + +for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi +done diff --git a/egs/aspire/s5/local/semisup/nnet3/run_student_ivector_common.sh b/egs/aspire/s5/local/semisup/nnet3/run_student_ivector_common.sh new file mode 100755 index 00000000000..b3cb3d1ec1a --- /dev/null +++ b/egs/aspire/s5/local/semisup/nnet3/run_student_ivector_common.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +nnet3_affix= +stage=1 + +orig_data_dir=data/ami_sdm1_train_sp_hires +student_data_dir=data/ami_sdm1_train_16kHz_sp_hires +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +num_threads_ubm=16 +nj=40 + +echo "$0 $@" # Print the command line for logging + +. ./path.sh +. ./cmd.sh + +set -e -o pipefail -u + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + exit 1 +fi + +if [ $stage -le 1 ]; then + if [ -f $student_data_dir/feats.scp ]; then + echo "$0: $student_data_dir/feats.scp exists. Remove it and skip this stage." 
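+    # (Clarifying note:) this guard prevents silently overwriting features that
+    # were already extracted; either remove $student_data_dir/feats.scp or re-run
+    # with --stage 2 or higher to skip this stage.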
+ exit 1 + fi + + utils/copy_data_dir.sh $orig_data_dir $student_data_dir + + steps/make_mfcc.sh --mfcc-config $student_mfcc_config --cmd "$train_cmd" --nj $nj \ + $student_data_dir + steps/compute_cmvn_stats.sh $student_data_dir + utils/fix_data_dir.sh $student_data_dir +fi + +if [ $stage -le 2 ]; then + for dset in $test_sets; do + utils/copy_data_dir.sh data/${dset} data/${dset}_hires + steps/make_mfcc.sh --mfcc-config $student_mfcc_config --cmd "$train_cmd" --nj $nj \ + data/${dset}_hires + steps/compute_cmvn_stats.sh data/${dset}_hires + utils/fix_data_dir.sh data/${dset}_hires + done +fi + +if [ $stage -le 3 ]; then + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 30000 --subsample 2 \ + $student_data_dir exp/nnet3${nnet3_affix}/pca_transform +fi + +if [ $stage -le 4 ]; then + num_utts=$(cat $student_data_dir/utt2spk | wc -l) + suffix= + if [ $num_utts -gt 30000 ]; then + utils/subset_data_dir.sh $student_data_dir 30000 ${student_data_dir}_30k + suffix=_30k + fi + + # To train a diagonal UBM we don't need very much data, so use the smallest + # subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $nj \ + --num-frames 400000 --num-threads $num_threads_ubm \ + ${student_data_dir}${suffix} 512 exp/nnet3${nnet3_affix}/pca_transform \ + exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 5 ]; then + num_utts=$(cat $student_data_dir/utt2spk | wc -l) + suffix= + if [ $num_utts -gt 100000 ]; then + utils/subset_data_dir.sh $student_data_dir 100000 ${student_data_dir}_100k + suffix=_100k + fi + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 100k subset (about one sixteenth of the data). 
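+  # (Note added for clarity:) steps/online/nnet2/train_ivector_extractor.sh
+  # produces 100-dimensional i-vectors by default, which is what the chain
+  # xconfigs above assume with "input dim=100 name=ivector"; if the extractor
+  # dimension is changed, the network's ivector input dim must be changed to match.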
+ steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj $nj \ + ${student_data_dir}${suffix} exp/nnet3${nnet3_affix}/diag_ubm \ + exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 6 ]; then + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + $student_data_dir ${student_data_dir}_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + ${student_data_dir}_max2 exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) + + for dset in $test_sets; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + data/${dset}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_$dset + done +fi diff --git a/egs/aspire/s5/local/semisup/run_300k.sh b/egs/aspire/s5/local/semisup/run_300k.sh new file mode 100644 index 00000000000..08f392bd8f1 --- /dev/null +++ b/egs/aspire/s5/local/semisup/run_300k.sh @@ -0,0 +1,18 @@ +false && { +local/fisher_train_lms_pocolm.sh --text data/train_300k_dev/text --lexicon data/local/dict/lexicon.txt --dir data/local/pocolm_300k --num-ngrams-large 250000 + +local/fisher_create_test_lang.sh --arpa-lm data/local/pocolm_300k/data/arpa/4gram_big.arpa.gz --lang data/lang_300k_pp --dir data/lang_300k_pp_test + +local/semisup/build_silprob.sh +} + +mkdir -p data/lang_300k_pp_ug_test + +oov=`cat data/lang_300k_pp/oov.int` || exit 1; +cp -rT data/lang_300k_pp data/lang_300k_pp_ug_test + +cat data/train_300k_dev/text | utils/sym2int.pl --map-oov $oov -f 2- data/lang_300k_pp/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > data/lang_300k_pp_ug_test/G.fst \ + || exit 1; + diff --git a/egs/aspire/s5/path.sh b/egs/aspire/s5/path.sh index 7fb6d91c543..6beff863a46 100755 --- a/egs/aspire/s5/path.sh +++ b/egs/aspire/s5/path.sh @@ -4,3 +4,5 @@ export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH . $KALDI_ROOT/tools/config/common_path.sh export PATH=$KALDI_ROOT/tools/sctk/bin:$PATH export LC_ALL=C +. /etc/profile.d/modules.sh +module load shared cuda80/toolkit diff --git a/egs/aspire/s5/run.sh b/egs/aspire/s5/run.sh index 851363a7532..5c490ac6fb0 100755 --- a/egs/aspire/s5/run.sh +++ b/egs/aspire/s5/run.sh @@ -15,7 +15,8 @@ set -e # Set this to somewhere where you want to put your aspire data, or where # someone else has already put it. You'll want to change this # if you're not on the CLSP grid. -aspire_data=/export/corpora/LDC/LDC2017S21/IARPA-ASpIRE-Dev-Sets-v2.0/data # JHU +# aspire_data=/export/corpora/LDC/LDC2017S21/IARPA-ASpIRE-Dev-Sets-v2.0/data # JHU +aspire_data=/export/common/data/corpora/LDC/LDC2017S21 # COE # the next command produces the data in local/train_all local/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \ diff --git a/egs/babel/s5d/local/chain/run_ivector_common.sh b/egs/babel/s5d/local/chain/run_ivector_common.sh index a1a145564d0..060c775d5d5 100755 --- a/egs/babel/s5d/local/chain/run_ivector_common.sh +++ b/egs/babel/s5d/local/chain/run_ivector_common.sh @@ -16,9 +16,10 @@ gmm=tri5_cleaned # This specifies a GMM-dir from the features # of the type you're training the system on; # it should contain alignments for 'train_set'. langdir=data/langp/tri5_ali - +generate_alignments=true num_threads_ubm=12 nnet3_affix=_cleaned +extractor= . ./cmd.sh . 
./path.sh @@ -57,7 +58,7 @@ if [ $stage -le 1 ]; then utils/fix_data_dir.sh data/${train_set}_sp fi -if [ $stage -le 2 ]; then +if $generate_alignments && [ $stage -le 2 ]; then echo "$0: aligning with the perturbed low-resolution data" steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 @@ -93,53 +94,55 @@ if [ $stage -le 3 ]; then steps/compute_cmvn_stats.sh \ data/${datadir}_hires_nopitch exp/make_hires/${datadir}_nopitch $mfccdir || exit 1; utils/fix_data_dir.sh data/${datadir}_hires_nopitch - done fi -if [ $stage -le 4 ]; then - echo "$0: computing a subset of data to train the diagonal UBM." - - mkdir -p exp/nnet3${nnet3_affix}/diag_ubm - temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm - - # train a diagonal UBM using a subset of about a quarter of the data - # we don't use the _comb data for this as there is no need for compatibility with - # the alignments, and using the non-combined data is more efficient for I/O - # (no messing about with piped commands). - num_utts_total=$(wc -l 3273 combine=-0.204->-0.179 +# xent:train/valid[31,47,final]=(-2.35,-1.89,-1.86/-2.49,-2.19,-2.17) +# logprob:train/valid[31,47,final]=(-0.199,-0.158,-0.154/-0.236,-0.221,-0.222) +# 206-zulu | %WER 52.2 | 22805 52162 | 51.6 38.2 10.2 3.8 52.2 30.7 | -0.629 | exp/chain_cleaned/tdnn_lstm_bab7_sp/decode_dev10h.pem/score_11/dev10h.pem.ctm.sys +# num-iters=66 nj=2..12 num-params=36.7M dim=43+100->3274 combine=-0.237->-0.215 +# xent:train/valid[43,65,final]=(-2.42,-1.96,-1.94/-2.53,-2.25,-2.24) +# logprob:train/valid[43,65,final]=(-0.239,-0.188,-0.186/-0.279,-0.267,-0.266) +# 104-pashto | %WER 40.2 | 21825 101803 | 63.8 25.8 10.4 3.9 40.2 29.8 | -0.438 | exp/chain_cleaned/tdnn_lstm_bab7_sp/decode_dev10h.pem/score_10/dev10h.pem.ctm.sys +# num-iters=85 nj=2..12 num-params=36.8M dim=43+100->3328 combine=-0.203->-0.189 +# xent:train/valid[55,84,final]=(-2.27,-1.81,-1.79/-2.46,-2.18,-2.17) +# logprob:train/valid[55,84,final]=(-0.213,-0.166,-0.163/-0.264,-0.249,-0.250) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +dropout_schedule='0,0@0.20,0.3@0.50,0' +train_set=train_cleaned +gmm=tri5_cleaned # the gmm for the target data +langdir=data/langp/tri5_ali +num_threads_ubm=12 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix="_bab9" #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 +chunk_left_context=40 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + $langdir $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20 dropout-proportion=0.0" + label_delay=5 + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=fastlstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=fastlstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
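+  # (Note added for clarity:) output-delay=$label_delay (5 frames here) makes the
+  # network predict each label a few frames after the corresponding input, which
+  # gives the unidirectional LSTM layers a small amount of future context; the
+  # delay is accounted for automatically through the model's context at decode time.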
+ output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage + fi + [ ! -d $dir/egs ] && mkdir -p $dir/egs/ + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context 0 \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph +fi + +exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_semisup_1a.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_semisup_1a.sh new file mode 100755 index 00000000000..5a509a1d14a --- /dev/null +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_semisup_1a.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 + +supervised_set=train_cleaned +unsupervised_set=train_unt.asr_seg_1a + +srcdir=exp/chain_cleaned/tdnn_lstm_bab9_2_nepochs10_h512_sp +treedir=exp/chain_cleaned/tree +src_extractor=exp/nnet3_cleaned/extractor +sup_lat_dir=exp/chain_cleaned/tri5_cleaned_train_cleaned_sp_lats + +nnet3_affix=_cleaned_semisup +chain_affix=_cleaned_semisup + +frames_per_eg=150,120,90,75 + +# Unsupervised options +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=1.0,1.0" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.0,0.0" + +# Semi-supervised options +affix=_semisup_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +num_threads_ubm=12 +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat < $srcdir/best_path_${unsupervised_set}_sp/frame_subsampling_factor +fi + +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1 + +sup_ali_dir=exp/tri5_cleaned + +diff $treedir/tree $srcdir/tree || { echo "$0: $treedir/tree and $srcdir/tree differ"; exit 1; } + +dir=exp/chain${chain_affix}/tdnn_lstm${affix}_sp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${srcdir}/best_path_${unsupervised_set}_sp \ + $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
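+  # (Clarifying note:) the extra output-0/output-1 (and *-xent) nodes below simply
+  # alias the main outputs: combine_egs.sh treats the supervised and unsupervised
+  # egs as two "languages" and renames their outputs to output-0 and output-1, so
+  # the network needs nodes with those names, all sharing the same output.affine
+  # parameters.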
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${srcdir}/decode_${unsupervised_set} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set} + + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
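+    # (Clarifying note:) for the unsupervised data the supervision is the decoded
+    # lattice rather than a single transcript: --lattice-lm-scale 0.5 keeps a scaled
+    # copy of the LM costs in the supervision FST, --lattice-prune-beam removes very
+    # unlikely paths, and --deriv-weights-scp supplies per-frame best-path posteriors
+    # (from best_path_weights.sh) so that unreliable frames contribute less to the
+    # derivatives.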
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $srcdir/best_path_${unsupervised_set}/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +wait; +exit 0; diff --git a/egs/babel/s5d/local/datasets/unsupervised_asr_seg.sh b/egs/babel/s5d/local/datasets/unsupervised_asr_seg.sh new file mode 100644 index 00000000000..0b84e94a15e --- /dev/null +++ b/egs/babel/s5d/local/datasets/unsupervised_asr_seg.sh @@ -0,0 +1,93 @@ +#This script is not really supposed to be run directly +#Instead, it should be sourced from the decoding script +#It makes many assumption on existence of certain environmental +#variables as well as certain directory structure. +if [ ${dataset_type} != "supervised" ] ; then + mandatory_variables="my_data_dir my_data_list my_nj" + optional_variables="" +else + mandatory_variables="my_data_dir my_data_list my_nj" + optional_variables="my_stm_file" +fi + +check_variables_are_set + +decode_opts="--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0" +sad_nnet_dir=exp/segmentation_1a/tdnn_lstm_asr_sad_1a + +workdir=exp/make_seg/${dataset_id} +unseg_dir=$workdir +mkdir -p $unseg_dir +# 4. Create the wav.scp file: +sph2pipe=`which sph2pipe || which $KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe` +if [ $? -ne 0 ] ; then + echo "Could not find sph2pipe binary. Add it to PATH" + exit 1; +fi +sox=`which sox` +if [ $? -ne 0 ] ; then + echo "Could not find sox binary. Add it to PATH" + exit 1; +fi + +echo "Creating the $unseg_dir/wav.scp file" +audiodir=$my_data_dir/audio +for file in `cat $my_data_list | sort -u` ; do + if [ -f $audiodir/$file.sph ] ; then + echo "$file $sph2pipe -f wav -p -c 1 $audiodir/$file.sph |" + elif [ -f $audiodir/$file.wav ] ; then + echo "$file $sox $audiodir/$file.wav -r 8000 -c 1 -b 16 -t wav - downsample |" + else + echo "Audio file $audiodir/$file.(sph|wav) does not exist!" >&2 + exit 1 + fi +done | sort -u > $unseg_dir/wav.scp + +l1=`cat $unseg_dir/wav.scp | wc -l ` +l2=`cat $my_data_list | wc -l ` +if [ "$l1" -ne "$l2" ] ; then + echo "wav.scp number of files: $l1" + echo "filelist number of files: $l2" + echo "Not all files from the list $my_data_list found their way into wav.scp" + exit 1 +fi + +echo "Creating the $unseg_dir/reco2file_and_channel file" +cat $unseg_dir/wav.scp | awk '{print $1, $1, "A";}' > $unseg_dir/reco2file_and_channel +cat $unseg_dir/wav.scp | awk '{print $1, $1;}' > $unseg_dir/utt2spk +utils/utt2spk_to_spk2utt.pl $unseg_dir/utt2spk > $unseg_dir/spk2utt + +steps/segmentation/detect_speech_activity.sh \ + $decode_opts \ + --nj $my_nj --acwt 0.3 \ + --mfcc-config conf/mfcc_hires_bp.conf \ + $unseg_dir \ + $sad_nnet_dir mfcc_hires_bp \ + $sad_nnet_dir $sad_nnet_dir/${dataset_id} + +utils/copy_data_dir.sh $sad_nnet_dir/${dataset_id}_seg $dataset_dir + +num_hours=`cat ${dataset_dir}/segments | \ + awk '{secs+= $4-$3;} END{print(secs/3600);}'` + +echo "Number of hours of the newly segmented data: $num_hours" + +if [ "$dataset_kind" == "supervised" ]; then + echo --------------------------------------------------------------------- + echo "preparing ${dataset_id} stm files in ${dataset_dir} on" `date` + echo --------------------------------------------------------------------- + if [ ! 
-z $my_stm_file ] ; then + local/augment_original_stm.pl $my_stm_file ${dataset_dir} + else + local/prepare_stm.pl --fragmentmarkers \-\*\~ ${dataset_dir} + fi +else + echo --------------------------------------------------------------------- + echo "preparing ${dataset_id} stm files in ${dataset_dir} on" `date` + echo --------------------------------------------------------------------- + if [ ! -z $my_stm_file ] ; then + local/augment_original_stm.pl $my_stm_file ${dataset_dir} + fi +fi + + diff --git a/egs/babel/s5d/local/run_asr_segmentation.sh b/egs/babel/s5d/local/run_asr_segmentation.sh index f70775526b6..40da4a4b50d 100755 --- a/egs/babel/s5d/local/run_asr_segmentation.sh +++ b/egs/babel/s5d/local/run_asr_segmentation.sh @@ -148,6 +148,7 @@ if [ $stage -le 6 ]; then --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --nj $test_nj --acwt 0.3 --stage $test_stage \ + --mfcc-config conf/mfcc_hires_bp.conf \ data/dev10h.pem \ exp/segmentation_1a/tdnn_lstm_asr_sad_1a \ mfcc_hires_bp \ diff --git a/egs/babel/s5d/run-4-anydecode.sh b/egs/babel/s5d/run-4-anydecode.sh index 52c997ae26a..18a6412ccf7 100755 --- a/egs/babel/s5d/run-4-anydecode.sh +++ b/egs/babel/s5d/run-4-anydecode.sh @@ -242,7 +242,9 @@ if [ ! -f $dataset_dir/.done ] ; then elif [ "$dataset_kind" == "unsupervised" ] ; then if [ "$dataset_segments" == "seg" ]; then . ./local/datasets/unsupervised_seg.sh - elif [[ $dataset_segments =~ *seg* ]]; then + elif [[ $dataset_segments =~ asr_seg* ]]; then + . ./local/datasets/unsupervised_asr_seg.sh + elif [[ $dataset_segments =~ seg* ]]; then . ./local/datasets/unsupervised_seg.sh elif [ "$dataset_segments" == "uem" ] ; then . ./local/datasets/unsupervised_uem.sh @@ -555,14 +557,16 @@ if [ -f exp/$chain_model/final.mdl ]; then my_nj_backup=$my_nj rnn_opts= if [ "$is_rnn" == "true" ]; then - rnn_opts=" --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk " + rnn_opts=" --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk --extra-left-context-initial 0 --extra-right-context-final 0" echo "Modifying the number of jobs as this is an RNN and decoding can be extremely slow." my_nj=`cat ${dataset_dir}_hires/spk2utt|wc -l` + if [ $my_nj -gt $my_nj_backup ]; then + my_nj=$my_nj_backup + fi fi if [ ! -f $decode/.done ]; then mkdir -p $decode echo "Modifying the number of jobs as this is an RNN and decoding can be extremely slow." - my_nj=`cat ${dataset_dir}_hires/spk2utt|wc -l` $decode_script --nj $my_nj --cmd "$decode_cmd" $rnn_opts \ --acwt 1.0 --post-decode-acwt 10.0 \ --beam $dnn_beam --lattice-beam $dnn_lat_beam \ diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index 88db78823a5..44ec34bcd61 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -11,5 +11,6 @@ # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
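# Illustration only (not part of cmd.sh): the recipe scripts expand these
# variables in front of a job range and a log file, roughly like
#   $decode_cmd JOB=1:$nj exp/foo/decode/log/decode.JOB.log some-kaldi-binary ...
# so raising decode_cmd to "--mem 8G" below makes every decode job request 8G,
# and the new tfrnnlm_cmd uses "-l hostname=b*" to pin TensorFlow-RNNLM jobs to
# the b* nodes of the CLSP grid.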
export train_cmd="queue.pl --mem 4G" -export decode_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 8G" export mkgraph_cmd="queue.pl --mem 8G" +export tfrnnlm_cmd="queue.pl -l hostname=b*" # this is specific to the CLSP grid diff --git a/egs/fisher_english/s5/local/chain/compare_wer_general.py b/egs/fisher_english/s5/local/chain/compare_wer_general.py new file mode 100755 index 00000000000..e6dc33779eb --- /dev/null +++ b/egs/fisher_english/s5/local/chain/compare_wer_general.py @@ -0,0 +1,244 @@ +#! /usr/bin/env python + +import argparse +import collections +import os +import re +import sys + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +from collections import defaultdict + +def get_args(): + parser = argparse.ArgumentParser( + description=""" +This script is used for comparing decoding results between systems. +e.g. local/chain/compare_wer_general.py exp/chain_cleaned/tdnn_{c,d}_sp +For use with discriminatively trained systems you specify the epochs after a colon: +for instance, +local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} +""") + + parser.add_argument("--separator", type=str, default=" ", + help="Separator for different fields") + parser.add_argument("--print-fine-details", action='store_true', + help="Add fine details of insertions, substitutions " + "and deletions.") + parser.add_argument("--include-looped", action='store_true', + help="Used to include looped results") + parser.add_argument("--field-size", type=int, + help="Field size for the models") + parser.add_argument("--outputs", type=str, default="output", + help="Comma separated list of output-names") + parser.add_argument("systems", nargs='+') + + args = parser.parse_args() + return args + + +def parse_system_string(system_string): + parts = system_string.split(":") + if len(parts) not in [1, 2, 3]: + raise RuntimeError("Unable to parse system string {0}" + "".format(system_string)) + + dir_name = parts[0] + + suffix = "" + if len(parts) > 1: + suffix = parts[1] + + model_name = os.path.basename(dir_name) + if len(parts) > 2: + model_name = parts[2] + + return (dir_name, suffix, model_name) + + +class SystemInfo(object): + def __init__(self, dir_name, suffix, model_name): + self.dir_name = dir_name + self.suffix = suffix + self.model_name = model_name + self.iter_ = "final" + + if self.suffix != "": + m = re.search("_iter(\d+)", suffix) + if bool(m): + self.iter_ = m.group(1) + else: + used_epochs = False + + self.probs = defaultdict(list) + self.wers = defaultdict(lambda: "NA") + self.ins = defaultdict(lambda: "NA") + self.dels = defaultdict(lambda: "NA") + self.sub = defaultdict(lambda: "NA") + + def add_wer(self, dev_set, is_looped=False): + decode_name = dev_set + self.suffix + + out = common_lib.get_command_stdout( + "grep WER {dir_name}/decode*_{decode_name}/wer* | grep {looped_filter} looped | utils/best_wer.sh" + "".format(dir_name=self.dir_name, decode_name=decode_name, + looped_filter="-v" if not is_looped else ""), + require_zero_status=False) + + affix = "looped" if is_looped else "" + if out != "" and len(out.split()) >= 2: + self.wers[(dev_set, affix)] = out.split()[1] + self.ins[(dev_set, affix)] = out.split()[6] + self.dels[(dev_set, affix)] = out.split()[8] + self.sub[(dev_set, affix)] = out.split()[10] + + def _get_prob(self, output_name="output", set_="train", xent=False): + + if not os.path.exists( + "{dir_name}/log/compute_prob_{set}.{iter}.log" + "".format(dir_name=self.dir_name, set=set_, iter=self.iter_)): + 
return "NA" + + out = common_lib.get_command_stdout( + "grep Overall {dir_name}/log/compute_prob_{set}.{iter}.log | " + "grep {opt} xent".format(dir_name=self.dir_name, set=set_, + iter=self.iter_, + opt="-w" if xent else "-v"), + require_zero_status=False) + + if out == "": + return "NA" + + lines = out.split("\n") + prob = None + + affix = "-xent" if xent else "" + for line in lines: + if bool(re.search(r"'{0}{1}'".format(output_name, affix), line)): + prob = float(line.split()[7]) + break + + return "NA" if prob is None else "{0:.4f}".format(prob) + + def add_probs(self, output_name="output"): + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="train", xent=False)) + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="valid", xent=False)) + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="train", xent=True)) + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="valid", xent=True)) + + +def run(args): + used_epochs = False + systems = [] + for sys_string in args.systems: + dir_name, suffix, model_name = parse_system_string(sys_string) + info = SystemInfo(dir_name, suffix, model_name) + + if suffix != "" and re.search(suffix, "epoch"): + used_epochs = True + else: + used_epochs = False + + for dev_set in ["dev", "test"]: + info.add_wer(dev_set) + + if args.include_looped: + info.add_wer(dev_set, is_looped=True) + + if not used_epochs: + for output_name in args.outputs.split(','): + info.add_probs(output_name) + + systems.append(info) + + print_system_infos(args, systems, used_epochs) + + +def print_system_infos(args, system_infos, used_epochs=False): + field_sizes = [args.field_size] * len(system_infos) + output_names = args.outputs.split(",") + + if args.field_size is None: + for i, x in enumerate(system_infos): + field_sizes[i] = len(x.model_name) + + separator = args.separator + print ("# {0: <35}{sep}{1}".format( + "System", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.model_name, field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + tups = set() + for sys_info in system_infos: + for tup in sys_info.wers: + tups.add(tup) + + for tup in sorted(list(tups)): + dev_set, affix = tup + print ("# {0: <35}{sep}{1}".format( + "WER on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.wers[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + if args.print_fine_details: + print ("# {0: <35}{sep}{1}".format( + "#Ins on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.ins[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + print ("# {0: <35}{sep}{1}".format( + "#Del on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.dels[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + print ("# {0: <35}{sep}{1}".format( + "#Sub on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.sub[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + if not used_epochs: + for output_name in output_names: + print ("# {0: <35}{sep}{1}".format( + "Final {0} train 
prob".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][0], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <35}{sep}{1}".format( + "Final {0} valid prob".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][1], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <35}{sep}{1}".format( + "Final {0} train prob (xent)".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][2], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <35}{sep}{1}".format( + "Final {0} valid prob (xent)".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][3], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + +if __name__ == "__main__": + args = get_args() + run(args) diff --git a/egs/fisher_english/s5/local/chain/confidence_calibration.sh b/egs/fisher_english/s5/local/chain/confidence_calibration.sh new file mode 100755 index 00000000000..34a487085aa --- /dev/null +++ b/egs/fisher_english/s5/local/chain/confidence_calibration.sh @@ -0,0 +1,88 @@ +#!/bin/bash +. cmd.sh +. path.sh + +chaindir=exp/chain_semi350k_conf/tdnn_xxsup1a_sp +arpa_gz=data/local/lm_ex250k/3gram-mincount/lm_unpruned.gz +graph_affix=_ex250k +decode_affix= +train_set=train_sup_5k_calib_train +dev_set=dev_sup_5k_calib_dev + +. utils/parse_options.sh + +set -euxo pipefail + +train_data=data/${train_set}_hires +dev_data=data/${dev_set}_hires + +decode_affix=${decode_affix}${graph_affix} +graphdir=$chaindir/graph${graph_affix} +train_caldir=$chaindir/decode_${train_set}${decode_affix}/confidence +dev_caldir=$chaindir/decode_${dev_set}${decode_affix}/confidence + +###### Data preparation, + +# Prepare filtering for excluding data from train-set (1 .. keep word, 0 .. exclude word), +# - only excludes from training-targets, the confidences are recalibrated for all the words, +word_filter=$(mktemp) +awk '{ keep_the_word = $1 !~ /^(\[.*\]|<.*>|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graphdir/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graphdir/phones/align_lexicon.txt \ + r=1 $graphdir/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graphdir/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt 10 \ + $train_data $graphdir $word_feats \ + $chaindir/decode_${train_set}${decode_affix} $train_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $dev_data $graphdir $chaindir/decode_${dev_set}${decode_affix} \ + $train_caldir $dev_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +exit 0 + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. 
+# Please compare with the default scoring script for your database. + +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \ + grep -i -v -E '' | \ + grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW) ' | \ + awk '$5 !~ /^.*-$/' | \ + local/map_acronyms_ctm.py -M data/local/dict_nosp/acronyms.map -i - -o ${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv + diff --git a/egs/fisher_english/s5/local/chain/run_semisupervised.sh b/egs/fisher_english/s5/local/chain/run_semisupervised.sh new file mode 100755 index 00000000000..77ae92e49b6 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/run_semisupervised.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +set -e -o pipefail + +stage=-2 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_egs_weight=1.0 +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 4.5" +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! 
cuda-compiled; then + cat <$n1?$n2:$n1)) + num_archives=$[num_archives*3/2] + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + echo $num_archives > $comb_egs_dir/info/num_archives + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + [ -f $sup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + [ -f $unsup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 12 --remove-egs false --train-set $supervised_set \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh new file mode 100644 index 00000000000..6254dd5d184 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train +tree_affix= +nnet3_affix= +gmm=tri5a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
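  # Illustrative note (not part of the patch): the tree built below is what later
  # fixes the output dimension of the network; stage 12 reads it back with
  #   num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
  # so the 11000 passed to build_tree.sh is only a target number of leaves, not
  # the exact pdf count that comes out.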
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 $build_tree_train_data_dir $lang $build_tree_ali_dir $treedir || exit 1; +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
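  # Side note (illustrative, not from the patch): create_split_dir.pl above only
  # matters on the CLSP grid; it creates $dir/egs/storage/1, 2, ... as symlinks
  # into the /export/b0{5,6,7,8} disks so the large egs archives are spread over
  # several file systems.  On a single-disk setup the hostname guard skips it,
  # and the .nodelete marker is just a flag that cleanup scripts check so the
  # egs are not removed if this run is later deleted.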
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh new file mode 100755 index 00000000000..c5e0401c3e5 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -e -o pipefail + +stage=-2 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_egs_weight=1.0 +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 4.5" +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. 
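# Illustrative usage (hypothetical option values, not from the patch): because of
# the utils/parse_options.sh call below, every variable defined above can be
# overridden from the command line, with underscores mapped to dashes, e.g.
#   local/chain/tuning/run_tdnn_semisupervised_a.sh \
#     --lattice-lm-scale 0.5 --lattice-prune-beam 4.0 \
#     --unsup-frames-per-eg 150 --comb-affix _comb1a_lmwt0.5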
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! cuda-compiled; then + cat <$n1?$n2:$n1)) + num_archives=$[num_archives*3/2] + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + echo $num_archives > $comb_egs_dir/info/num_archives + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + [ -f $sup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + [ -f $unsup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 12 --remove-egs false --train-set $supervised_set \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh new file mode 100755 index 00000000000..0c12140c8c7 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_ex250k +egs_affix=_prun2_lmwt0_tol2 # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +common_egs_dir= + +# Semi-supervised options +comb_affix=_comb1b2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 + +decode_iter= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z "$decode_iter" ]; then + iter_opts=" --iter $decode_iter " + else + decode_iter=final + fi + + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output;" $dir/$decode_iter.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/$decode_iter.mdl $dir/${decode_iter}-output.mdl + + iter_opts=" --iter ${decode_iter}-output " + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh new file mode 100644 index 00000000000..4a0b5f1dd26 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh @@ -0,0 +1,331 @@ +#!/bin/bash + +# This script is the baseline with unsupervised egs in multilingual recipe. 
+# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 0 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +unsup_egs_dir=$chaindir/unsup_egs${decode_affix}${egs_affix} + +if [ $stage -le 9 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
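  # Illustration (assumed example entry, not from the patch): the weights.scp
  # passed to --deriv-weights-scp below indexes one per-frame weight vector per
  # utterance, e.g. a line like
  #   fe_03_00001-A-000123-000456 exp/.../weights.1.ark:42
  # pointing at values in [0,1]; frames with weight 0 contribute nothing to the
  # chain derivative, so per-frame confidences can be used to down-weight
  # uncertain unsupervised frames.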
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir \ + ${chaindir}/decode_${unsupervised_set}${decode_affix} $unsup_egs_dir +fi + +sup_egs_dir=$chaindir/egs_scp +comb_egs_dir=$chaindir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 10 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_sp_lats # not required since egs is given. +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh new file mode 100644 index 00000000000..0564bf693ab --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# This script is similar to _a but uses denominator FST created using +# LM estimated on supervised + unsupervised set phone sequences +# and deriv weights from calibrated confidences. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=_comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + false && $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats +sup_egs_dir=$dir/egs_${supervised_set} + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $(cat $chaindir/egs/info/frames_per_eg) \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir +fi + +unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
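  # Illustrative note (not part of the patch): the make_den_fst.sh call in stage 9
  # above, with --weights $lm_weights (here "3,2"), estimates the denominator
  # phone LM from a weighted mix of phone sequences, roughly
  #   3 x (supervised alignments) + 2 x (best paths decoded on the unsupervised data)
  # which is what the header's "Unsupervised weight for phone LM: 2/3" refers to.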
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -2 ]; then + train_stage=-2 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh new file mode 100644 index 00000000000..572a3f8466e --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh @@ -0,0 +1,298 @@ +#!/bin/bash + +# This script is similar to _a but uses deriv weights from lattice-posteriors. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 0 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
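# (Illustrative note, not part of the patch: the three chain outputs above play
#  different roles.  As far as I can tell from the recipe, the combined egs from
#  combine_egs.sh address 'output-0' (supervised egs) and 'output-1'
#  (unsupervised egs), while the plain 'output' node is kept so the final model
#  can be decoded normally.  The perl edit applied to final.config after
#  xconfig_to_configs.py rewrites lines such as
#    component-node name=output-1.affine component=output-1.affine input=prefinal-chain
#  to point at component=output-0.affine instead, so both egs sources update the
#  same affine parameters; the exact line format in final.config may differ.)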
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh new file mode 100644 index 00000000000..24734d216e2 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# This script is similar to _c but re-creates supervised egs using new +# normalization FST. 
+# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +sup_egs_dir= +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + false && $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + + left_context=`cat $chaindir/egs/info/left_context` + right_context=`cat $chaindir/egs/info/right_context` + left_context_initial=`cat $chaindir/egs/info/left_context_initial` + right_context_final=`cat $chaindir/egs/info/right_context_final` + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + left_context=`cat $sup_egs_dir/info/left_context` + right_context=`cat $sup_egs_dir/info/right_context` + left_context_initial=`cat $sup_egs_dir/info/left_context_initial` + right_context_final=`cat $sup_egs_dir/info/right_context_final` + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg +unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh new file mode 100644 index 00000000000..faef0c70546 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh @@ -0,0 +1,347 @@ +#!/bin/bash + +# This script is similar to _e but uses deriv weights from lattice-posteriors +# instead of from calibrated confidences. +# But there is a minor bug in creating the lattice posteriors when this +# script was run. An acwt of 1.0 was used for lattice-best-path when it +# should have been 0.1. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +sup_egs_dir= +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
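# (Illustrative note, not part of the patch: the lm_weights=3,2 set near the top
#  of this script is passed to the den-FST step used by these scripts,
#    steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \
#      ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} $dir
#  which appears to weight the supervised alignments and the unsupervised
#  best-path phone sequences 3:2 when estimating the denominator phone LM --
#  the "Unsupervised weight for phone LM: 2/3" mentioned in the header.)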
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh new file mode 100644 index 00000000000..9dbca030174 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh @@ -0,0 +1,383 @@ +#!/bin/bash + +# This script is same as _e but uses a weight of 1.0 for unsupervised egs. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 1.0 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + left_context=`cat $sup_egs_dir/info/left_context` + right_context=`cat $sup_egs_dir/info/right_context` + left_context_initial=`cat $sup_egs_dir/info/left_context_initial` + right_context_final=`cat $sup_egs_dir/info/right_context_final` + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
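# (Illustrative sketch, not part of the patch: the scp passed to
#  --deriv-weights-scp below holds per-frame derivative weights for the
#  unsupervised egs.  In the variants of this setup that take those weights from
#  lattice posteriors (e.g. the _f and _h scripts), they can be computed from
#  the decode lattices with standard Kaldi binaries, roughly as follows -- the
#  acoustic scale and file names here are assumptions, not the exact commands
#  that were run:
#    lattice-best-path --acoustic-scale=0.1 \
#      "ark:gunzip -c $unsup_lat_dir/lat.1.gz |" ark:/dev/null "ark:|gzip -c > ali.1.gz"
#    lattice-to-post --acoustic-scale=0.1 "ark:gunzip -c $unsup_lat_dir/lat.1.gz |" ark:- | \
#      get-post-on-ali ark:- "ark:gunzip -c ali.1.gz |" ark,scp:weights.1.ark,weights.1.scp
#  )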
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh new file mode 100644 index 00000000000..866f310c0ed --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh @@ -0,0 +1,348 @@ +#!/bin/bash + +# This script is same as _g, but uses deriv weights from lattice posteriors +# instead of calibrated confidences. But there was a bug when running this +# script. (An acwt of 1.0 was used for lattice-best-path instead of 0.1) +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
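# (Illustrative note, not part of the patch: the supervision_weights value set
#  near the top of this script is passed to combine_egs.sh as
#    --lang2weight $supervision_weights
#  and, as far as I can tell, scales the derivatives of the egs from each source
#  in order -- the first value applies to the supervised egs (output-0) and the
#  second to the unsupervised egs (output-1), which is what the "Unsupervised
#  weight" lines in these scripts' headers refer to.)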
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh new file mode 100644 index 00000000000..69e29d600c9 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is similar to _h, but uses unsup_frames_per_eg of 300. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1i # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
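# (Illustrative note, not part of the patch: since this variant uses
#  unsup_frames_per_eg=300 while the supervised egs keep 150-frame chunks, the
#  train.py call below uses the variable minibatch specification
#    --trainer.num-chunk-per-minibatch "150=128/300=64"
#  i.e. 128 chunks per minibatch for 150-frame chunks and 64 for 300-frame
#  chunks, so both kinds of minibatch contain the same 19200 frames.)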
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh new file mode 100644 index 00000000000..6d98f9cf6da --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _k, but uses a weight of 0.5 for unsupervised egs. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1j # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh new file mode 100644 index 00000000000..96d101ac2f2 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but uses an lm-scale of 0.1. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1k # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
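+  # e.g. with the default xent_regularize=0.1 set above, learning_rate_factor
+  # works out to 0.5 / 0.1 = 5.0.
+  # output-0[-xent] and output-1[-xent] correspond to the two egs sources in
+  # the combined egs dir (supervised and unsupervised); the perl edit on
+  # final.config below makes output-1 reuse output-0's affine component.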
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh new file mode 100644 index 00000000000..371bfcfc1b6 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but uses an lm-scale of 0.5. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1l # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
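+  # e.g. with the default xent_regularize=0.1 set above, learning_rate_factor
+  # works out to 0.5 / 0.1 = 5.0.
+  # output-0[-xent] and output-1[-xent] correspond to the two egs sources in
+  # the combined egs dir (supervised and unsupervised); the perl edit on
+  # final.config below makes output-1 reuse output-0's affine component.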
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh new file mode 100644 index 00000000000..b608e77e309 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1m # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
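+  # e.g. with the default xent_regularize=0.1 set above, learning_rate_factor
+  # works out to 0.5 / 0.1 = 5.0.
+  # output-0[-xent] and output-1[-xent] correspond to the two egs sources in
+  # the combined egs dir (supervised and unsupervised); the perl edit on
+  # final.config below makes output-1 reuse output-0's affine component.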
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh new file mode 100644 index 00000000000..b463ed56485 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _c, but redone to be consistent with _m. +# So it does not have any deriv weights. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
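+  # e.g. with the default xent_regularize=0.1 set above, learning_rate_factor
+  # works out to 0.5 / 0.1 = 5.0.
+  # output-0[-xent] and output-1[-xent] correspond to the two egs sources in
+  # the combined egs dir (supervised and unsupervised); the perl edit on
+  # final.config below makes output-1 reuse output-0's affine component.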
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh new file mode 100644 index 00000000000..b4e9e1e5faf --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# This script is same as _a, but re-done to be consistent with _m. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1o # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=1 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
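+  # e.g. with the default xent_regularize=0.1 set above, learning_rate_factor
+  # works out to 0.5 / 0.1 = 5.0.
+  # output-0[-xent] and output-1[-xent] correspond to the two egs sources in
+  # the combined egs dir (supervised and unsupervised); the perl edit on
+  # final.config below makes output-1 reuse output-0's affine component.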
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh new file mode 100644 index 00000000000..7137523c843 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb270k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train +semi_affix=270k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a_20k # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1p # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
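+  # e.g. with the default xent_regularize=0.1 set above, learning_rate_factor
+  # works out to 0.5 / 0.1 = 5.0.
+  # output-0[-xent] and output-1[-xent] correspond to the two egs sources in
+  # the combined egs dir (supervised and unsupervised); the perl edit on
+  # final.config below makes output-1 reuse output-0's affine component.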
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh new file mode 100644 index 00000000000..cf12901f617 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb270k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup_20k +semi_affix=270k_conf_pca # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a_20k # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
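+  # e.g. with the default xent_regularize=0.1 set above, learning_rate_factor
+  # works out to 0.5 / 0.1 = 5.0.
+  # output-0[-xent] and output-1[-xent] correspond to the two egs sources in
+  # the combined egs dir (supervised and unsupervised); the perl edit on
+  # final.config below makes output-1 reuse output-0's affine component.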
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/fisher_train_lms.sh b/egs/fisher_english/s5/local/fisher_train_lms.sh index 881d3ce9466..811e09dec6d 100755 --- a/egs/fisher_english/s5/local/fisher_train_lms.sh +++ b/egs/fisher_english/s5/local/fisher_train_lms.sh @@ -6,6 +6,14 @@ text=data/train_all/text lexicon=data/local/dict/lexicon.txt +dir=data/local/lm + +. 
utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0 [options]" + exit 1 +fi for f in "$text" "$lexicon"; do [ ! -f $x ] && echo "$0: No such file $f" && exit 1; @@ -17,7 +25,6 @@ done #data/train_all/text #data/local/dict/lexicon.txt -dir=data/local/lm mkdir -p $dir export LC_ALL=C # You'll get errors about things being not sorted, if you # have a different locale. @@ -70,6 +77,8 @@ cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1] train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; +train_lm.sh --arpa --lmtype 4gram-mincount $dir || exit 1; + # Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 # note: output is diff --git a/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh index 906703953a1..6c41a2c3e58 100755 --- a/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh +++ b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh @@ -136,8 +136,21 @@ fi if [ $stage -le 2 ]; then echo "$0: pruning the LM (to larger size)" # Using 5 million n-grams for a big LM for rescoring purposes. - prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big - + prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_big/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_big/metaparameters ]; then + if [ -z `tail ${dir}/data/lm_${order}_prune_big/prune_lm.log | grep "can not do any pruning"` ]; then + echo "$0: LM could not be pruned. Something went wrong!" + exit 1 + fi + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz + echo "$0: No pruning necessary as num-ngrams is less than target" + exit 0 + fi + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_real_dev_set.log @@ -148,9 +161,21 @@ fi if [ $stage -le 3 ]; then echo "$0: pruning the LM (to smaller size)" - # Using 2.5 million n-grams for a smaller LM for graph building. - # Prune from the bigger-pruned LM, it'll be faster. - prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + # Using 2.5 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_small/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_small/metaparameters ]; then + if [ -z `tail ${dir}/data/lm_${order}_prune_small/prune_lm.log | grep "can not do any pruning"` ]; then + echo "$0: LM could not be pruned. Something went wrong!" 
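+      # The "can not do any pruning" message in prune_lm.log means the LM is
+      # already under the n-gram target (benign, handled below); its absence
+      # here means pruning failed for some other reason, so treat it as an error.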
+ exit 1 + fi + + ln -s ${order}gram_big.arpa.gz $dir/data/arpa/${order}gram_small.arpa.gz + exit 0 + fi + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh index b203f9638b4..736459a8113 100755 --- a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh @@ -95,6 +95,7 @@ if [ $stage -le 6 ]; then steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ data/${ivector_train_set}_hires $exp_root/nnet3${nnet3_affix}/diag_ubm \ $exp_root/nnet3${nnet3_affix}/extractor || exit 1; +>>>>>>> e8b4f50d30df411bb156ff3927a41f20f6cffa99 fi if [ $stage -le 7 ]; then diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh new file mode 100755 index 00000000000..e159781e9a1 --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +speed_perturb=true +train_set=train +ivector_train_set= # data set for training i-vector extractor. + # If not provided, train_set will be used. + +nnet3_affix= +exp=exp + +. ./path.sh +. ./utils/parse_options.sh + +# perturbed data preparation +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have + # to perturb the normal data to get the alignments. + # _sp stands for speed-perturbed + + for datadir in ${train_set} ${ivector_train_set}; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/fix_data_dir.sh data/${datadir}_sp + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_sp + done + fi + train_set=${train_set}_sp + if ! [ -z "$ivector_train_set" ]; then + ivector_train_set=${ivector_train_set}_sp + fi +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $ivector_train_set $train_set; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). 
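+    # (Roughly: utils/fix_data_dir.sh drops utterances whose features could not
+    # be extracted and re-filters/sorts utt2spk, spk2utt, feats.scp etc. so the
+    # data directory stays self-consistent.)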
+ utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in test dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done +fi + +if [ -z "$ivector_train_set" ]; then + ivector_train_set=$train_set +fi + +# ivector extractor training +if [ $stage -le 4 ]; then + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 10000 --subsample 2 \ + data/${ivector_train_set}_hires \ + $exp/nnet3${nnet3_affix}/pca_transform +fi + +if [ $stage -le 5 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${ivector_train_set}_hires 512 \ + $exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${ivector_train_set}_hires $exp/nnet3${nnet3_affix}/diag_ubm $exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + # We extract iVectors on all the ${train_set} data, which will be what we + # train the system on. + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${ivector_train_set}_hires data/${ivector_train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${ivector_train_set}_max2_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; +fi + +if [ $stage -le 8 ]; then + for dataset in test dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + done +fi + +exit 0; + diff --git a/egs/fisher_english/s5/local/nnet3/run_tdnn.sh b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh new file mode 100644 index 00000000000..f055b853b61 --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# This script is not tested. + +# this is the standard "tdnn" system, built in nnet3; it's what we used to +# call multi-splice. + +. ./cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
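+    # A brief note on the unsupervised-egs options used below (roughly):
+    # --lattice-lm-scale scales the graph/LM costs of the decoded lattices when
+    # they are converted into supervision, and --lattice-prune-beam prunes those
+    # lattices first so that only reasonably likely paths contribute.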
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..1308339ad93 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
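+    # Note: --alignment-subsampling-factor is 1 here (vs. 3 for the supervised
+    # egs above), since these lattices come from decoding with the chain seed
+    # model and are therefore already at the subsampled output frame rate.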
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_a.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..5ac69af585f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_a.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 6,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7f # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=6,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_f +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
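+    # Note: --deriv-weights-scp supplies per-frame derivative weights (best-path
+    # pdf posteriors from the seed model's decode, cf. the header comment), so
+    # frames the seed model is less sure about contribute less to the gradient.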
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_b.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..d345ca5f20e --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_b.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 6,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=6,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
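+    # Note: with use_smart_splitting=true this picks get_egs_split.sh (the
+    # "_smart" affix), which handles splitting of the lattice supervision itself
+    # into chunks; otherwise the plain get_egs.sh ("_naive") is used.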
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_b.sh new file mode 100755 index 
00000000000..22fc833f613 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_b.sh @@ -0,0 +1,197 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses an extra layer. + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix= +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c.sh new file mode 100755 index 00000000000..f6b94ee594c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7c +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
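+  # [Hedged reviewer note, not in the original script.]  The --context-opts
+  # passed below request a left-biphone tree: --context-width=2 means each
+  # question sees the current phone plus one neighbour, and
+  # --central-position=1 makes that neighbour the left-context phone.  For
+  # comparison, the conventional triphone setting would be
+  #   --context-opts "--context-width=3 --central-position=1"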
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
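+  # [Hedged reviewer note, not in the original script.]  learning_rate_factor
+  # computed in stage 12 above is simply 0.5 / xent_regularize, e.g.
+  # 0.5 / 0.1 = 5.0, so the xent output layer trains at a rate that does not
+  # shrink as the regularizer is made smaller.  The "echo print ... | python"
+  # form assumes python2; a version-agnostic equivalent (illustration only)
+  # would be:
+  #   learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")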
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh new file mode 100755 index 00000000000..d14aa752c14 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh @@ -0,0 +1,210 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses an extra layer. + +# configs for 'chain' +stage=0 +tdnn_affix=7c_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
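+  # [Hedged reviewer note, not in the original script.]  The create_split_dir.pl
+  # call above is a CLSP-grid convenience: it scatters $dir/egs/storage over
+  # several /export disks via symlinks so that egs I/O is not bottlenecked on a
+  # single filesystem.  On another cluster the analogous (purely illustrative)
+  # invocation would be:
+  #   utils/create_split_dir.pl /some/disk{1,2}/$USER/egs/storage $dir/egs/storage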
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh new file mode 100755 index 00000000000..6fe8cdececc --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh @@ -0,0 +1,280 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_d +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# smbr finetuning +do_smbr_finetuning=false + +finetune_num_extra_lm_states=2000 +finetune_stage=-1 # Set this lower to train den.fst +finetune_suffix=_smbr +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_l2_regularize=0.00005 +finetune_opts="--chain.mmi-factor-schedule=0.0,0.0 --chain.smbr-factor-schedule=1,1" +finetune_leaky_hmm_coefficient=0.001 +finetune_apply_deriv_weights=true +finetune_lr=0.000005 +chain_smbr_extra_opts= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
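+  # [Hedged reviewer note, not in the original script.]  --frame-subsampling-factor 3
+  # below means the chain model is evaluated once every 3 input frames, so with
+  # 10 ms frames a 150-frame chunk covers 150 x 10 ms = 1.5 s of audio and
+  # yields 150 / 3 = 50 chain outputs.  The egs generation and training further
+  # down assume this same factor.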
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
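+  # [Hedged reviewer note, not in the original script.]  "--egs.chunk-width
+  # 160,140,110,80" below lets egs generation mix several chunk lengths rather
+  # than one fixed size, which wastes less audio on utterances whose length
+  # does not divide evenly into a single chunk size.  The widths actually used
+  # are recorded alongside the egs and can be read back once they exist, e.g.
+  #   cat $dir/egs/info/frames_per_eg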
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi + +if ! $do_smbr_finetuning; then + wait + exit 0; +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ ! 
-z "$common_egs_dir" ]; then + egs_dir=$common_egs_dir + else + egs_dir=$dir/egs + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient $finetune_leaky_hmm_coefficient \ + --chain.l2-regularize $finetune_l2_regularize \ + --chain.apply-deriv-weights $finetune_apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=$finetune_num_extra_lm_states" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate $finetune_lr \ + --trainer.optimization.final-effective-lrate $(perl -e "print $finetune_lr * 0.1") \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir --lang $lang \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_e.sh new file mode 100755 index 00000000000..36f9107039b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_e.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
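+  # [Hedged reviewer note, not in the original script.]  7000 below is the
+  # requested number of tree leaves (context-dependent pdfs); the tree builder
+  # may settle on slightly fewer, and whatever it produces is what stage 12
+  # later reads back as the output-layer dimension:
+  #   num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')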
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
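+  # [Hedged reviewer note, not in the original script.]  The egs are kept on
+  # disk here (remove_egs=false plus the .nodelete marker) because the
+  # semi-supervised recipes added later in this patch can reuse a supervised
+  # model's egs instead of regenerating them, e.g. by passing something like
+  # (illustrative only):
+  #   --sup-egs-dir $dir/egs
+  # to run_tdnn_100k_semisupervised_conf_a.sh.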
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh new file mode 100755 index 00000000000..b6aa3520d3b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh @@ -0,0 +1,199 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7f +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_f +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
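+  # [Hedged reviewer note, not in the original script.]  The _sp suffix denotes
+  # 3-way speed perturbation (factors 0.9, 1.0 and 1.1), so data/${train_set}_sp
+  # holds roughly three times the audio of data/${train_set}; such a copy can
+  # be made with e.g.
+  #   utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
+  # and the lattices in $lat_dir used below must come from that same perturbed set.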
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
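+  # [Hedged reviewer note, not in the original script.]  "--generate-egs-scp true"
+  # in the egs options below makes egs generation also write per-example scp
+  # indexes; those indexes are what steps/nnet3/multilingual/combine_egs.sh
+  # consumes when the semi-supervised recipes in this patch mix supervised and
+  # unsupervised egs, roughly as in (mirrored from those scripts):
+  #   steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \
+  #     --lang2weight 1.0,1.0 --egs-prefix cegs. 2 \
+  #     $sup_egs_dir $unsup_egs_dir $comb_egs_dir
+  # so it needs to stay enabled if these egs are to be reused there.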
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..17e0d7609a8 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This script is semi-supervised training with 100 hours supervised data +# and 250 hours unsupervised data with naive splitting. 
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 3,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+exp=exp/semisup_100k
+
+supervised_set=train_sup
+unsupervised_set=train_unsup100k_250k
+semisup_train_set= # semisup100k_250k
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used
+
+# Unsupervised options
+decode_affix=
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=3,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=
+unsup_egs_opts=
+apply_deriv_weights=true
+
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=
+
+RANDOM=0
+
+if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
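+      # [Hedged reviewer note, not in the original script.]  The options on the
+      # get_egs.sh call below are what turn decode lattices into supervision:
+      # --lattice-lm-scale 0.5 keeps a scaled-down LM/graph score in the
+      # supervision, --lattice-prune-beam 4.0 prunes unlikely lattice paths
+      # first, and --deriv-weights-scp downweights frames whose best-path pdf
+      # has low posterior.  With the defaults at the top of this script the
+      # egs_affix works out to "_prun4.0_lmwt0.5_tol1", so these egs land in
+      # $dir/egs_${unsupervised_set}${decode_affix}_prun4.0_lmwt0.5_tol1.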
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a2.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a2.sh 
new file mode 100644 index 00000000000..2e6c8bc36f3 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a2.sh @@ -0,0 +1,464 @@ +#!/bin/bash + +# This script is semi-supervised training with 100 hours supervised data +# and 250 hours unsupervised data with naive splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
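+    # Note: the options below follow the settings declared at the top of this
+    # script; this is a sketch of their intent, not an authoritative description:
+    #  * --lattice-lm-scale ($lattice_lm_scale) scales the LM costs of the
+    #    unsupervised lattices when they are turned into supervision weights;
+    #  * --lattice-prune-beam ($lattice_prune_beam), if non-empty, prunes the
+    #    lattices before examples are extracted;
+    #  * --deriv-weights-scp points to per-frame weights taken from the lattice
+    #    posterior of the best-path pdf, produced in the best_path directory.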
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh new 
file mode 100644 index 00000000000..9331642f43b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh @@ -0,0 +1,462 @@ +#!/bin/bash + +# This script is similar to _a but uses smart splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
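+    # Note: at this step the only substantive difference from the naive-splitting
+    # variant (conf_a*) appears to be the use of get_egs_split.sh ("smart"
+    # splitting of the lattice supervision, per the comment at the top of this
+    # script) in place of get_egs.sh; the options passed are otherwise identical.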
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh new 
file mode 100644 index 00000000000..a7121dcb8dd --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This script is semi-supervised training with 100 hours supervised data +# and 250 hours unsupervised data with naive splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1c2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
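+    # Note: the supervised egs above are extracted with
+    # --alignment-subsampling-factor 3 because their lattices come from the
+    # tri4a system at the original frame rate, whereas the unsupervised egs
+    # below use --alignment-subsampling-factor 1, presumably because those
+    # lattices were produced by decoding with the chain model, whose output is
+    # already at the subsampled frame rate.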
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh new 
file mode 100644 index 00000000000..31ac754ffef --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh @@ -0,0 +1,462 @@ +#!/bin/bash + +# This script is similar to _c, but uses smart splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
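+    # Note: unsup_frames_per_eg was left empty at the top of this script, so by
+    # the fallback above it inherits frames_per_eg from the supervised egs
+    # (typically 150, matching --egs.chunk-width below).  With the usual chain
+    # frame_subsampling_factor of 3, a 150-frame chunk corresponds to roughly
+    # 50 output frames of supervision.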
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh new 
file mode 100644 index 00000000000..2ffa5320d0f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh @@ -0,0 +1,465 @@ +#!/bin/bash + +# This script is similar to _d, but uses 3gram LM with best path from 4gram LM. + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix}_fg \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh new 
file mode 100644 index 00000000000..5caf0bbb00c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh @@ -0,0 +1,465 @@ +#!/bin/bash + +# This is semi-supervised training with 500 hours of unsupervised data. +# This script is similar to _d, but with 500 hours sunsupervised data. + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh new 
file mode 100644 index 00000000000..dbfde9787aa --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This is semi-supervised training with 500 hours of unsupervised data. +# This script is similar to _f, but uses 3gram LM. + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +rescore_unsup_lattices=true + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh new 
file mode 100644 index 00000000000..99311ab9887 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This is semi-supervised training with 500 hours of unsupervised data. +# This script is similar _g, but Naive split lattices + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +rescore_unsup_lattices=false + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh new file mode 100755 index 
00000000000..d4cb820a03b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh @@ -0,0 +1,193 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup11k +unsup_train_set=train_unsup_250k_240k +tree_affix= +nnet3_affix=_semi11k_250k +chain_affix=_semi11k_250k +exp=exp/semisup_11k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..245f25641b9 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh @@ -0,0 +1,365 @@ +#!/bin/bash + +# This script is for semi-supervised training with 250h unsupervised set and +# around 10-15h supervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 0.3 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
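+  # For example, with the default xent_regularize=0.1 set above, the
+  # learning-rate-factor applied to the xent output layer below works out to
+  # 0.5 / 0.1 = 5.0.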
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
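+    # Notes on the unsupervised egs generated below:
+    #  - --alignment-subsampling-factor is 1 (rather than 3) because the lattices
+    #    were produced by decoding with the chain seed model, so they are already
+    #    at the subsampled frame rate.
+    #  - --lattice-lm-scale is the scale on the lattice graph (LM) scores carried
+    #    into the supervision weights (0.0 with the defaults here).
+    #  - --deriv-weights-scp supplies per-frame derivative weights taken from the
+    #    best-path posteriors ("Deriv weights" in the header comment).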
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..9b7a424b897 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh @@ -0,0 +1,362 @@ +#!/bin/bash + +# This script is the same as _a, but uses no deriv weights. +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.3 +# Weights for phone LM (supervised, unsupervised): 5,2 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
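+  # output-0-xent / output-1-xent below (like output-0 / output-1 above) are
+  # per-source copies of the output layers: index 0 is used for the supervised
+  # egs and index 1 for the unsupervised egs in the combined egs dir. The edit
+  # applied to final.config after xconfig_to_configs.py ties the output-1 layers
+  # to the output-0 affine components, so both sources share the same parameters.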
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..60f64dee299 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh @@ -0,0 +1,359 @@ +#!/bin/bash + +# This script is the same as _f, but fixes the bug with the acwt used for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..23c58768b04 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh @@ -0,0 +1,379 @@ +#!/bin/bash + +# This script is the same as _a, but uses a 4gram LM for generating unsupervised data lattices. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 0.3 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..9f6d3a23b8a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is the same as _d, but uses a weight of 1.0 for the unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --weights $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
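+  # note: besides 'output' and 'output-xent', extra output nodes output-0/output-1
+  # (and output-0-xent/output-1-xent) are declared further below; they reuse
+  # output.affine and output-xent.log-softmax, so the two egs sources given to
+  # combine_egs.sh (presumably 0 = supervised and 1 = unsupervised, matching the
+  # order in which the egs dirs are passed) update the same underlying parameters.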
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
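+    # The lattices from decoding the unsupervised data act as the supervision here:
+    # --lattice-lm-scale and --lattice-prune-beam control how the lattice scores are
+    # scaled and pruned, and --deriv-weights-scp applies the best-path pdf posteriors
+    # as per-frame derivative weights (see the header comment of this script).
+    # --alignment-subsampling-factor is 1 presumably because these lattices come from
+    # a chain-model decode and are therefore already at the subsampled frame rate.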
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e_old.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e_old.sh new file mode 100644 index 
00000000000..567cee619a0
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e_old.sh
@@ -0,0 +1,450 @@
+#!/bin/bash
+
+# This script is the same as _d, but uses a weight of 1.0 for the unsupervised set.
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup11k_250k # for reference
+exp=exp/semisup_11k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup11k
+semi_affix=semi11k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+lm_opts=
+
+# Unsupervised options
+decode_affix=
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=2.0 # If supplied, the lattices will be pruned prior to getting egs for unsupervised data
+tolerance=2
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1e_old # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+comb_egs_dir=
+tree_affix=fg
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size=128
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=${tree_affix}_${semi_affix}
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --weights $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
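+  # note: here output-0/output-1 (and output-0-xent/output-1-xent) are separate
+  # output-layer components rather than plain output nodes; the final.config edit
+  # applied after xconfig_to_configs.py below re-points output-1's (and
+  # output-1-xent's) affine components at output-0's, so the two outputs still
+  # end up sharing parameters.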
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh new file mode 100644 index 00000000000..ac986ce6dda 
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh
@@ -0,0 +1,379 @@
+#!/bin/bash
+
+# This script is the same as _e, but runs for 3 epochs instead of 4.
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup11k_250k # for reference
+exp=exp/semisup_11k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup11k
+semi_affix=semi11k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=2.0 # If supplied, the lattices will be pruned prior to getting egs for unsupervised data
+tolerance=2
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1f # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=fg
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size=128
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=${tree_affix}_${semi_affix}
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh new file mode 100644 index 00000000000..12909f33e15 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh @@ -0,0 +1,379 @@ +#!/bin/bash + +# This script is same as _f, but uses 300 frames-per-eg +# unsup_frames_per_eg=300 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
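+    # This variant sets unsup_frames_per_eg=300, i.e. the unsupervised chunks are
+    # twice as long as the supervised ones (nominally 150, read from the seed
+    # model's egs info / --egs.chunk-width below); the minibatch spec
+    # "150=128/300=64" passed to train.py sizes minibatches for both chunk lengths.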
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh new file mode 100644 index 00000000000..c3f1cabc81a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh @@ -0,0 +1,385 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +rnnlm_weight=0.5 +rnnlm_dir=data/tf_fast_lstm_ex250k +rnnlm_affix=unk.fast.tfrnnlm + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix:+${tree_affix}_}${semi_affix} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
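+    # Illustrative note (added): the get_egs.sh call below is where the
+    # unsupervised data differs from the supervised egs extraction above --
+    # the supervision comes from decoded lattices rather than GMM alignments
+    # (hence --alignment-subsampling-factor 1, since the lattices are already
+    # at the subsampled output frame rate), --lattice-lm-scale and
+    # --lattice-prune-beam control how much of the lattice uncertainty is
+    # kept, and --deriv-weights-scp down-weights each frame by the best-path
+    # pdf posterior so unreliable frames contribute less to the gradient.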
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_i.sh new file mode 100644 index 00000000000..6afb3e2276f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_i.sh @@ -0,0 +1,381 @@ +#!/bin/bash + +# This script is same as _f, but uses a separate silence tolerance. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1i # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}_sil${sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
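+  # Note added for clarity: output-0 (supervised egs), output-1 (unsupervised
+  # egs) and the plain 'output' node (removed before decoding, when output-0
+  # is renamed to 'output' in stage 18) all sit on prefinal-chain with
+  # identical dimensions; a perl edit of final.config just after this heredoc
+  # points the output-1 layers at the output-0 affine components, so the two
+  # egs streams share their output-layer parameters.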
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
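+    # Added note: relative to the fixed-tolerance recipes, the call below also
+    # passes --left-tolerance-silence/--right-tolerance-silence
+    # (sil_tolerance, default 1 here), giving silence phones a tighter
+    # tolerance than the $tolerance (default 2) frames of slack allowed for
+    # other phones when turning the lattice alignments into chain supervision.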
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_j.sh new file mode 100644 index 00000000000..c3c0db77856 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_j.sh @@ -0,0 +1,465 @@ +#!/bin/bash + +# This script is same as _f, but uses UNK model. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram with UNK model + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance= +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1j # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}${sil_tolerance:+_sil$sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
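+  # Rough context bookkeeping, added for illustration: the lda splice
+  # Append(-1,0,1) plus the tdnn2..tdnn6 splices (-1..2, -3..3, -3..3, -3..3,
+  # -6..0) give this network about 17 frames of left context and 12 of right
+  # context; those values are written to $dir/configs/vars as
+  # model_left_context/model_right_context and seed the egs-extraction
+  # context arithmetic further down.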
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_k.sh new file mode 100644 index 00000000000..8f83f1a3529 --- /dev/null +++ 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_k.sh @@ -0,0 +1,406 @@ +#!/bin/bash + +# This script is same as _f, but uses UNK model. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram with UNK model + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance= +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1k # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + exit 1 +fi + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}${sil_tolerance:+_sil$sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
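+  # Added note: num_targets above is the pdf count reported by
+  # "tree-info $treedir/tree", i.e. the number of leaves of the tree built
+  # jointly from the supervised alignments and the unsupervised best-path
+  # output, so every output-* layer in this config has that same dimension.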
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
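+    # Clarifying comment (added): on the CLSP grid, create_split_dir.pl above
+    # spreads the archives under $unsup_egs_dir/storage across the listed
+    # /export/b0{5,6,7,8} disks via symlinks, and the empty .nodelete marker
+    # is the convention that tells cleanup jobs not to remove these egs even
+    # if the run that created them dies.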
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${lang_test_suffix}${unk_prob_scale:+=_unkscale$unk_prob_scale} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${unk_prob_scale:+--unk-prob-scale $unk_prob_scale} \ + data/lang_test${lang_test_suffix} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh new file mode 100644 index 00000000000..7e5d41887cd --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh @@ -0,0 +1,491 @@ +#!/bin/bash + +# This script is same as _f, but uses UNK model. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram with UNK model + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance= +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1l # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +do_finetuning=false +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}${sil_tolerance:+_sil$sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${supervised_set} data/${supervised_set}_sp_hires \ + ${chaindir}_lats_${supervised_set}_sp \ + ${chaindir}_best_path_${supervised_set} || exit 1 + echo $frame_subsampling_factor > ${chaindir}_best_path_${supervised_set}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} ${chaindir}_best_path_${supervised_set} \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_unk +if [ $stage -le 17 ] && [ ! -f $graph_dir/HCLG.fst ]; then + if [ ! -f data/lang_test_unk/L_disambig.fst ]; then + utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "$(cat data/lang/oov.txt)" data/local/lm data/lang_test_unk + + cp data/lang_test/G.fst data/lang_test_unk/ + fi + + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_unk $dir $graph_dir +fi + +decode_suffix=_unk +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${decode_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait; exit 0; +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh new 
file mode 100644 index 00000000000..6c3fd38deff --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh @@ -0,0 +1,437 @@ +#!/bin/bash + +# This script is same as _e, but uses tree trained only on supervised data. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion +apply_deriv_weights=true + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1m # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + mkdir -p $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
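A note on the xent-branch comment in the xconfig block above: with the default xent_regularize=0.1 set at the top of this script, the factor is just 0.5 divided by that constant. A minimal worked sketch, mirroring the python one-liner these recipes use to compute learning_rate_factor:
  # Worked example, assuming the default xent_regularize=0.1 from this script's header:
  #   learning_rate_factor = 0.5 / 0.1 = 5.0
  # i.e. the xent output layer trains 5x faster, cancelling the small regularization
  # constant so its final layer learns at a rate independent of it.
  echo "print 0.5/0.1" | python   # -> 5.0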
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh new file mode 100644 index 00000000000..4d92f6df1e0 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh @@ -0,0 +1,443 @@ +#!/bin/bash + +# This script is same as _m, but uses split lattices for supervision. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
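The --deriv-weights-scp passed to the extraction call below points at per-frame weights (the best-path pdf posteriors mentioned in this script's header), so frames whose best-path label is uncertain contribute less to the gradient. A small sanity-check sketch, assuming the best_path directory was created by the earlier stages as usual:
  # Print the first weight vector in text form; values should lie in [0, 1].
  copy-vector "scp:$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp" ark,t:- 2>/dev/null | head -n 1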
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_q.sh new 
file mode 100644 index 00000000000..5207acd410c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_q.sh @@ -0,0 +1,436 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
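The context padding computed above adds roughly half an output step of extra input frames on each side, so chunks at the subsampled output rate are still fully covered. A worked example with illustrative numbers (the real values come from $dir/configs/vars):
  # Assuming model_left_context=17, extra_left_context=0, frame_subsampling_factor=3:
  #   left_context = int(17 + 0 + 3/2) = int(18.5) = 18
  perl -e "print int(17 + 3 / 2), \"\n\";"   # -> 18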
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp "" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + #$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_r.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_r.sh new file mode 100644 index 00000000000..4b083f356b2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_r.sh @@ -0,0 +1,436 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1r # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
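The create_split_dir.pl call above is only run on the CLSP grid; it spreads the (large) egs archives across several file systems, leaving numbered symlinks under storage/, and the .nodelete marker keeps the egs from being cleaned up if the run dies. A sketch of how to inspect the result, assuming the JHU export paths exist:
  # storage/1, storage/2, ... are symlinks into /export/b0{5,6,7,8}/$USER/kaldi-data/...
  ls -l $sup_egs_dir/storage | head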
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_a.sh new file mode 100755 index 00000000000..a9832070763 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_a.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=best_path_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
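+    # The supervision for these egs comes from the decode lattices in $unsup_lat_dir,
+    # which are already at the chain model's output frame rate, hence
+    # --alignment-subsampling-factor 1; --deriv-weights-scp weights each frame by the
+    # best-path pdf posterior produced earlier by steps/best_path_weights.sh.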
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --keep-only-best-path true \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh new file mode 100755 
index 00000000000..774ce524221 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh @@ -0,0 +1,474 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=best_path_comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! cuda-compiled; then + cat < $out_dir/lat.JOB.gz" || exit 1 + fi + fi + + if [ ! -f $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.1.gz ]; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${det_decode_affix} \ + $chaindir/decode_${dset}_sp${det_decode_affix}_fg + fi + fi +done + +if [ ! 
-f $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix}_fg/ali.1.gz ]; then + if [ $stage -le 8 ]; then + steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ + data/${unsupervised_set}_sp_hires data/lang_chain \ + $chaindir/decode_${unsupervised_set}_sp${det_decode_affix}_fg \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix}_fg + fi +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri3 + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +this_nj=$(cat $chaindir/decode_${unsupervised_set}_sp${decode_affix}/num_jobs) + +if [ $stage -le 9 ]; then + out_dir=$chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg + $train_cmd JOB=1:$this_nj $out_dir/log/get_best_path_lats.JOB.log \ + lattice-interp "ark:gunzip -c $chaindir/decode_${unsupervised_set}_sp${decode_affix}/lat.JOB.gz |" \ + "ark:gunzip -c $chaindir/decode_${unsupervised_set}_sp${det_decode_affix}_fg/lat.JOB.gz | lattice-1best --acoustic-scale=0.1 ark:- ark:- |" \ + "ark:| gzip -c > $out_dir/lat.JOB.gz" + + echo $this_nj > $out_dir/num_jobs +fi + +ln -sf ../final.mdl $chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg/final.mdl + +echo $this_nj > $chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg/num_jobs + +decode_affix=${decode_affix}_fg + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ + $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output 
input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/best_path_lats_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${det_decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh new file mode 100755 
index 00000000000..6c5eb9679c6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh @@ -0,0 +1,454 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=best_path_comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/1best_lats_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
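+    # Here $unsup_lat_dir points to the 1-best lattices (1best_lats_*), so each
+    # unsupervised utterance contributes a single decoded path as supervision rather
+    # than a full lattice; the per-frame deriv weights still come from the best-path
+    # pdf posteriors in $chaindir/best_path_*/weights.scp.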
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh new file mode 100755 index 
00000000000..43f3505d545 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh @@ -0,0 +1,194 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is based on run_tdnn_11k.sh, but uses a chunk width of 160,140,110,80. + +# configs for 'chain' +stage=0 +tdnn_affix=7c +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix= +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh new file mode 100755 index 00000000000..1541d8c8e02 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh @@ -0,0 +1,196 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _c, but uses a biphone tree with up to 7000 leaves. + +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=bi_d +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
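+  # --context-opts "--context-width=2 --central-position=1" builds a left-biphone tree
+  # (a two-phone window with the current phone in the last position), hence the "bi"
+  # in the tree affix; the number of leaves is capped at 7000.
+  # If needed, the resulting context can be checked with, e.g.:
+  #   tree-info $treedir/tree | grep -E 'context-width|central-position|num-pdfs'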
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
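+  # Note on the egs options used below: the variable chunk widths (160,140,110,80
+  # frames) let utterances of different lengths be packed into examples with less
+  # padding than a single fixed width, and --generate-egs-scp true writes an egs.scp,
+  # presumably so these egs can later be re-used (e.g. combined with unsupervised egs).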
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh new file mode 100755 index 00000000000..91d938e5f42 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh @@ -0,0 +1,196 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is similar to _d, but uses a biphone tree with up to 2000 leaves. + +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=e +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh new file mode 100755 index 00000000000..906d8eeca98 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh @@ -0,0 +1,196 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is similar to _d, but uses a biphone tree with up to 4000 leaves. + +# configs for 'chain' +stage=0 +tdnn_affix=7f +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=f +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_g.sh new file mode 100644 index 00000000000..47160845f30 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_g.sh @@ -0,0 +1,178 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _c, but uses a biphone tree with up to 7000 leaves. + +# configs for 'chain' +stage=0 +tdnn_affix=7g +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup250k_240k +semisup_set=semisup15k_250k +tree_affix=bi_g +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +orig_treedir=exp/semisup_15k/chain_semi15k_250k/tree_bi_d +unsup_alidir=exp/semisup_15k/chain_semi15k_250k/tdnn7d_sp/best_path_train_unsup250k_240k_unphdet_ex250k_fg + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh new file mode 100644 index 00000000000..978df45345f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh @@ -0,0 +1,178 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but trains tree using even unsupervised data. + +# configs for 'chain' +stage=0 +tdnn_affix=7h +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup250k_240k +semisup_set=semisup15k_250k +tree_affix=bi_h +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +sup_alidir=exp/semisup_15k/chain_semi15k_250k/tri3_train_sup15k_sp_ali +unsup_alidir=exp/semisup_15k/chain_semi15k_250k/tdnn7d_sp/best_path_train_unsup250k_240k_sp_unphdet_ex250k_fg + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_i.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_i.sh new file mode 100755 index 00000000000..4ab3fa480d5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_i.sh @@ -0,0 +1,197 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7i +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_j.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_j.sh new file mode 100755 index 00000000000..0c39841d7ef --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_j.sh @@ -0,0 +1,197 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a speed-perturbed data for tree building + +# configs for 'chain' +stage=0 +tdnn_affix=7j +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=bi_j +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
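+  # Unlike _d, which built its tree on the un-perturbed data using $gmm_dir,
+  # this variant (as the header notes) builds the tree on the speed-perturbed
+  # data and passes $lat_dir in place of the GMM directory.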
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh new file mode 100644 index 00000000000..c02005540f5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh @@ -0,0 +1,449 @@ +#!/bin/bash + +# This script is same as _z, but uses 7d as seed model and bi_d tree. 
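+# (Here '7d' refers to the supervised chain model built by
+# tuning/run_tdnn_15k_d.sh, i.e. tdnn_affix=7d, and 'bi_d' to the biphone tree
+# from that run, i.e. tree_affix=bi_d.)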
+# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=160,140,110,80 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg= +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1aa # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
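+    # A note on the call below: the decoding lattices of the unsupervised data
+    # are used as (uncertain) supervision; --lattice-lm-scale keeps a
+    # scaled-down version of the lattice LM scores, --lattice-prune-beam prunes
+    # the lattices before egs are extracted, and --deriv-weights-scp weights
+    # each frame by the best-path pdf posterior, as summarized in the header of
+    # this script.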
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh 
new file mode 100644 index 00000000000..50ed5f5ee6b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh @@ -0,0 +1,450 @@ +#!/bin/bash + +# This script is same as _z, but does rescoring correctly. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg= +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ab # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
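  # Explanatory note (not in the original recipe): the get_egs_split.sh call below
  # converts the decoded lattices of the unsupervised data into chain training
  # examples.  --lattice-lm-scale 0.5 keeps the lattice graph/LM scores at half
  # weight inside the supervision FSTs, --lattice-prune-beam 4.0 prunes unlikely
  # lattice paths first, and --deriv-weights-scp scales each frame's derivative by
  # the best-path pdf posterior in weights.scp, so low-confidence frames contribute
  # less to the gradient.  --alignment-subsampling-factor 1 is used because the
  # lattices come from decoding with the seed chain model and so are already at the
  # subsampled output frame rate.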
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ac.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ac.sh 
new file mode 100644 index 00000000000..973bbf93c6a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ac.sh @@ -0,0 +1,452 @@ +#!/bin/bash + +# This script is same as _ab, but uses frames_per_eg 150. This is same as _z with rescoring done correctly. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ac # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + ! $remove_egs && touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + ! $remove_egs && touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
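  # Explanatory note (not in the original recipe): two knobs control how the
  # supervised and unsupervised data are mixed.  lm_weights=5,2 was passed above to
  # make_weighted_den_fst.sh as --num-repeats, so supervised alignments are
  # repeated 5 times and unsupervised best-path alignments 2 times when building
  # the denominator phone LM.  supervision_weights=1.0,1.0 is passed below to
  # combine_egs.sh as --lang2weight, giving both kinds of examples equal weight in
  # the objective; a value like 1.0,0.5 would instead halve the unsupervised
  # contribution.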
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ad.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ad.sh 
new file mode 100644 index 00000000000..9b0c9e8560c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ad.sh @@ -0,0 +1,452 @@ +#!/bin/bash + +# This script is same as _ac, but uses naive splitting. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ad # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + ! $remove_egs && touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + ! $remove_egs && touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
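  # Explanatory note (not in the original recipe): unlike _ab/_ac, which call
  # steps/nnet3/chain/get_egs_split.sh ("smart" splitting of the lattice
  # supervision), this variant uses the regular steps/nnet3/chain/get_egs.sh below,
  # i.e. the unsupervised lattices are cut into fixed 150-frame chunks naively,
  # matching the header line "Supervision: Naive split lattices".  The remaining
  # options (lm-scale, prune beam, deriv weights) are unchanged from _ac.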
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ae.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ae.sh 
new file mode 100644 index 00000000000..68ac5bc51f9 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ae.sh @@ -0,0 +1,449 @@ +#!/bin/bash + +# This script is same as _aa, but uses frames-per-eg of 150 +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ae # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
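  # Explanatory note (not in the original recipe; the numbers are hypothetical):
  # the context values computed above pad the model's own context by half the
  # frame-subsampling factor before egs extraction.  For example, with
  # model_left_context=16, extra_left_context=0 and frame_subsampling_factor=3:
  #   left_context = int(16 + 0 + 3/2) = 17
  # The same padded contexts are passed to both the supervised get_egs.sh call
  # above and the unsupervised get_egs_split.sh call below.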
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_af.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_af.sh 
new file mode 100644 index 00000000000..9e6814c3328 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_af.sh @@ -0,0 +1,449 @@ +#!/bin/bash + +# This script is same as _ac, but uses naive splitting. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1af # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
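+ # --- Editor's note (illustrative only, not part of the original recipe) ----
+ # With the defaults declared at the top of this script (decode_affix=_unphdet,
+ # graph_affix=_ex250k, lattice_prune_beam=4.0, lattice_lm_scale=0.5, tolerance=1),
+ # egs_affix expands to "_prun4.0_lmwt0.5_tol1", so the unsupervised egs produced
+ # by the next step would land in a directory named like:
+ #   $dir/egs_train_unsup250k_sp_unphdet_ex250k_prun4.0_lmwt0.5_tol1
+ # ---------------------------------------------------------------------------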
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ag.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ag.sh 
new file mode 100644 index 00000000000..2d86b39564d --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ag.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +# This script is same as _z, but does rescoring correctly. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg= +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ag # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
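+ # --- Editor's note (illustrative only, not part of the original recipe) ----
+ # In this variant unsup_frames_per_eg is left empty at the top of the script,
+ # so the fallback a few lines above makes it inherit frames_per_eg from the
+ # supervised egs config.  A slightly more robust spelling of that fallback
+ # would quote the variable, e.g.:
+ #   [ -z "$unsup_frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg
+ # (unquoted, the test still succeeds only because "[ -z ]" with a single
+ # argument happens to evaluate to true).
+ # ---------------------------------------------------------------------------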
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ah.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ah.sh 
new file mode 100644 index 00000000000..9e2a3bb088d --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ah.sh @@ -0,0 +1,451 @@ +#!/bin/bash + +# This script is same as _ae, but uses naive splitting. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ah # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
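+ # --- Editor's note (illustrative only, not part of the original recipe) ----
+ # The --deriv-weights-scp option below points at the per-frame weights written
+ # by the best-path stage (posterior of the best-path pdf, per the header
+ # comments).  Together with --chain.apply-deriv-weights true at training time,
+ # these weights scale the derivatives computed from the unsupervised examples.
+ # A simple sanity check before egs generation (sketch, assuming the weights
+ # file has already been produced) could be:
+ #   [ -s $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp ] || \
+ #     { echo "$0: missing deriv weights"; exit 1; }
+ # ---------------------------------------------------------------------------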
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh 
new file mode 100644 index 00000000000..3c08529e985 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh @@ -0,0 +1,455 @@ +#!/bin/bash + +# This script is same as _ae, but uses larger 7h tree +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7h # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ai # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_h +unsup_egs_opts= +apply_deriv_weights=true +rescore_unsup_lattices=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
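+ # --- Editor's note (illustrative only, not part of the original recipe) ----
+ # Once the supervised and unsupervised egs exist, stage 14 below merges them
+ # with steps/nnet3/multilingual/combine_egs.sh, giving each source the weight
+ # listed in supervision_weights (1.0,1.0 here, i.e. equal weight).  With
+ # comb_affix=comb1ai and the default affixes, the combined egs directory is
+ # named:
+ #   $dir/comb1ai_egs_unphdet_ex250k_prun4.0_lmwt0.5_tol1_multi
+ # ---------------------------------------------------------------------------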
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh 
new file mode 100644 index 00000000000..cf6bcf82bc2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is same as _ae, but uses 7i model and tree with UNK decoding +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1aj # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
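# ----------------------------------------------------------------------------
# [Editorial note, not part of the original patch.] The context values
# computed above are what get_egs.sh expects: the model's own left/right
# context, plus any extra context requested, plus about half a
# frame-subsampling window (presumably to cover rounding at chunk edges).
# A minimal worked example, kept commented out so it does not run; the model
# context of 16 is a hypothetical value, not one read from this script:
#
#   model_left_context=16; extra_left_context=0; frame_subsampling_factor=3
#   left_context=$[model_left_context + extra_left_context]                # 16
#   left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"`
#   echo $left_context                                                      # 17
# ----------------------------------------------------------------------------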
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ak.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ak.sh new file mode 100644 index 00000000000..eff2ab7bfa6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ak.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is the same as _aj, but uses the 7i model and supervised lattices from UNK phone LM alignment. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ak # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
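# ----------------------------------------------------------------------------
# [Editorial note, not part of the original patch.] The two egs dumps differ
# in subsampling: the supervised egs above come from GMM lattices at the full
# frame rate, so get_egs.sh is given --alignment-subsampling-factor 3 to bring
# the supervision down to the chain model's output rate, while the
# unsupervised egs below use --alignment-subsampling-factor 1 because their
# lattices were produced by decoding with the already frame-subsampled chain
# model. Rough illustrative arithmetic, commented out so it does not run:
#
#   input_frames=300                  # roughly 3 seconds at 100 frames/sec
#   echo $[input_frames / 3]          # about 100 supervision frames per chunk
# ----------------------------------------------------------------------------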
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_al.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_al.sh new file mode 100644 index 00000000000..4a12d96c5bf --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_al.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is the same as _aj, but uses pocolm for the LM. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7j # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1al # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_j +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
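# ----------------------------------------------------------------------------
# [Editorial note, not part of the original patch.] The combination step
# below reuses the multilingual egs machinery: the supervised and
# unsupervised dumps are treated as two "languages" routed to output-0 and
# output-1 (both tied to output.affine in the xconfig above), and
# --lang2weight applies the per-source scales in $supervision_weights to the
# objective. With this script's defaults the call expands to roughly:
#
#   steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \
#     --minibatch-size 128 --frames-per-iter 1500000 \
#     --lang2weight 1.0,1.0 --egs-prefix cegs. 2 \
#     $sup_egs_dir $unsup_egs_dir $comb_egs_dir
# ----------------------------------------------------------------------------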
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_am.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_am.sh new file mode 100644 index 00000000000..d0e21e0c25a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_am.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# This script is similar to _al, but builds a larger tree using unsupervised data. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7j # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1am # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + sup_ali_dir=$exp/tri3 + + if [ -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl exists. Remove it and run again."
+ exit 1 + fi + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ + --frame-subsampling-factor 3 \ + 7000 $lang \ + data/${supervised_set} \ + ${sup_ali_dir} \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir || exit 1 +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
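# ----------------------------------------------------------------------------
# [Editorial note, not part of the original patch.] The --deriv-weights-scp
# passed to get_egs.sh below attaches a per-frame weight (the best-path pdf
# posterior mentioned in the header) to each unsupervised example, and
# --chain.apply-deriv-weights in train.py scales the derivatives by these
# weights, so low-confidence frames contribute less to the gradient. To
# eyeball the weights for one utterance (assuming weights.scp holds Kaldi
# vectors, as best_path_weights.sh normally writes), one could run:
#
#   copy-vector "scp:$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp" ark,t:- | head -n 1
# ----------------------------------------------------------------------------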
+ + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_an.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_an.sh new file mode 100644 index 00000000000..eb6cdca6b0c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_an.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is the same as _al, but uses different frames-per-eg for the supervised data. +# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7j # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=160,140,110,80 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1an # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_j +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
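+    # The --left-context/--right-context passed to the egs scripts below come from the
+    # arithmetic near the top of this script: the model's own context, plus any
+    # extra_*_context, widened by about half a frame-subsampling period.  As a rough
+    # worked example (the model-context numbers here are illustrative assumptions,
+    # not values read from this particular model):
+    #   model_left_context=17, model_right_context=12, frame_subsampling_factor=3
+    #   left_context  = int(17 + 0 + 3/2) = 18
+    #   right_context = int(12 + 0 + 3/2) = 13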
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ao.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ao.sh new file mode 100644 index 00000000000..b4d83f83a73 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ao.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is the same as _ak, but uses a different frames-per-eg for the supervised data. +# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg is set + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=160,140,110,80 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ao # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
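+    # A note on the chunk sizes in this setup: sup_frames_per_eg=160,140,110,80 lets
+    # the egs script cut the supervised utterances into examples of several lengths
+    # rather than one fixed length, while the unsupervised examples below use single
+    # 150-frame chunks.  The matching minibatch_size string "150=128/300=64" is the
+    # usual nnet3 rule-list form pairing an example length with a minibatch size:
+    # chunks of around 150 frames are batched 128 at a time, and ones of around 300
+    # frames 64 at a time.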
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ap.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ap.sh new file mode 100644 index 00000000000..8b94b46f3bb --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ap.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is the same as _ao, but creates the denominator FST using speed-perturbed data. +# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg is set + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=160,140,110,80 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ap # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
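+    # weights.scp under $chaindir/best_path_${unsupervised_set}${decode_affix} holds one
+    # vector per unsupervised utterance: the lattice posterior of the best-path pdf at
+    # each frame.  Passing it as --deriv-weights-scp below makes training scale each
+    # frame's derivative by that confidence, so low-confidence frames contribute less.
+    # To eyeball a couple of the weight vectors (illustrative command, assuming the
+    # Kaldi binaries are on the PATH):
+    #   copy-vector scp:$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \
+    #     ark,t:- | head -n 2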
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aq.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aq.sh new file mode 100644 index 00000000000..7cd4f890d6a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aq.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is the same as _ao, but uses a smaller weight on the supervised phone alignments +# because they are from speed-perturbed data. +# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,6 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg is set + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=160,140,110,80 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1aq # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,6 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
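+    # Reminder about the phone-LM weighting done in stage 10 earlier in this script:
+    # make_weighted_den_fst.sh --num-repeats "$lm_weights" builds the denominator phone
+    # LM from two sources, the supervised alignments under $treedir and the unsupervised
+    # best-path phone sequences, effectively repeating (i.e. weighting) each source's
+    # sequences the given number of times.  With 5,6 here instead of the 5,2 used in
+    # _ao, the supervised alignments carry relatively less weight, since they come from
+    # speed-perturbed data.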
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh new file mode 100644 index 00000000000..e3d1aae047b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is the same as _d, but uses a weight of 1.0 for the unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
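+    # A short summary of how the unsupervised examples are supervised here,
+    # following the settings at the top of this script: the decoded lattices act
+    # as the numerator supervision; with lattice_lm_scale=0.0 the lattice LM
+    # scores do not contribute to the supervision weights, lattice_prune_beam=2.0
+    # prunes the lattices before they are converted, tolerance=2 allows the phone
+    # boundaries to move by roughly that many (subsampled) frames, and the
+    # per-frame deriv weights are the best-path pdf posteriors (weights.scp),
+    # as described in the header comments.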
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_p.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_p.sh new file mode 100644 index 00000000000..19d17ef8418 
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_p.sh
@@ -0,0 +1,454 @@
+#!/bin/bash
+
+# This script is the same as _d, but uses a weight of 1.0 for the unsupervised set.
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+lm_opts=
+unsup_egs_opts=
+
+# Unsupervised options
+decode_affix=
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1p # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=fg
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size=128
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=${tree_affix}_${semi_affix}
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
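+    # Unlike the _o variant, this script extracts unsupervised egs with
+    # get_egs_split_and_convert.sh (note data/lang_chain is passed in) and uses
+    # non-zero lattice weights: lattice_lm_scale=0.5, lattice_prune_beam=4.0 and
+    # tolerance=1, as set at the top. The --splitter-opts below ask for partial
+    # "unk" labels on the left and right, presumably to mark phones that are cut
+    # at the chunk edges; any extra options can be passed via $unsup_egs_opts.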
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split_and_convert.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + --splitter-opts "--add-partial-unk-label-left --add-partial-unk-label-right" \ + $unsup_egs_opts \ + data/${unsupervised_set}_hires data/lang_chain $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh new 
file mode 100644
index 00000000000..d587196ce9f
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh
@@ -0,0 +1,473 @@
+#!/bin/bash
+
+# This script is the same as _d, but uses a weight of 1.0 for the unsupervised set.
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+lm_opts=
+
+# Unsupervised options
+decode_affix=
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=2.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data
+tolerance=2
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=fg
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128,64/300=64,32/600=32,16,8/1200=16,8,4"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=${tree_affix}_${semi_affix}
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp_sil +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
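+    # Points specific to this variant: the unsupervised data dir carries an
+    # extra _sil suffix (unsupervised_set was reset to ${unsupervised_set}_sp_sil
+    # above), get_egs.sh is called with --no-chunking true below, the i-vectors
+    # come from the unsupervised set itself rather than from ${base_train_set},
+    # and training later uses the variable minibatch_size schedule defined at
+    # the top of the script.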
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --no-chunking true \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_r.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_r.sh new file mode 100644 index 
00000000000..e7179b3bc76
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_r.sh
@@ -0,0 +1,437 @@
+#!/bin/bash
+
+# This script is the same as _d, but uses a weight of 1.0 for the unsupervised set.
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1r # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=
+unsup_egs_opts=
+apply_deriv_weights=true
+
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size=128
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
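+    # Where the numbers above come from: frames_per_eg is read from the
+    # supervised model's existing egs (info/frames_per_eg), and the left/right
+    # contexts are the model contexts from $dir/configs/vars plus
+    # extra_{left,right}_context, rounded up by about half the frame-subsampling
+    # factor (with frame_subsampling_factor=3, int(x + 3/2) adds one frame per
+    # side). The supervised lattices are at the original frame rate, hence
+    # --alignment-subsampling-factor 3 below.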
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --trainer.objective-scales="output-1-xent:0.5" \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh new file mode 100644 index 00000000000..3bd721ba32c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=_unphdet
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1s # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=
+unsup_egs_opts=
+apply_deriv_weights=true
+
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=
+
+RANDOM=0
+
+if ! cuda-compiled; then
+ cat < $out_dir/lat.JOB.gz" || exit 1
+  fi
+  fi
+
+  if [ !
-f $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.1.gz ]; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${det_decode_affix} \ + $chaindir/decode_${dset}_sp${det_decode_affix}_fg + fi + fi + + if [ $stage -le 7 ]; then + out_dir=$chaindir/decode_${dset}_sp${decode_affix}_fg + mkdir -p $out_dir + + $decode_cmd JOB=1:$decode_nj $out_dir/log/compose_lat.JOB.log \ + lattice-interp --alpha=0 --write-compact=false \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${decode_affix}/lat.JOB.gz |" \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.JOB.gz |" \ + "ark:| gzip -c > $out_dir/lat.JOB.gz" + echo $decode_nj > $out_dir/num_jobs + + ln -sf ../final.mdl $out_dir/final.mdl || true + fi +done + +decode_affix=${decode_affix}_fg + +if [ $stage -le 8 ]; then + steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ + data/${unsupervised_set}_sp_hires data/lang_chain \ + $chaindir/decode_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri3 + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}${det_decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${det_decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${det_decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim 
target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
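  # A note on the subsampling flags in the call below (a reading of the setup,
  # not stated in the patch, with the usual frame_subsampling_factor=3 assumed):
  # the supervised lattices in $sup_lat_dir come from the GMM system (tri3) at
  # the full 100-frames/sec rate, so --alignment-subsampling-factor 3 brings
  # them down to the chain model's output rate; the unsupervised lattices used
  # later are produced by the chain model itself and are already subsampled,
  # which is why the unsupervised egs extraction passes
  # --alignment-subsampling-factor 1.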
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${det_decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh new file mode 100644 index 00000000000..591df3c8aab --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh @@ -0,0 +1,440 @@ +#!/bin/bash + +# This script is same as _n, but uses RNN-LM for decoding. 
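# (A reading of the rnnlm_* options declared below; the rescoring step itself
# is not shown in this patch.) The unsupervised-data lattices are rescored with
# a TensorFlow fast-LSTM RNN-LM before best-path weights and egs are extracted;
# rnnlm_weight=0.5 would interpolate the RNN-LM scores with the original n-gram
# LM scores with equal weight, while rnnlm_dir points at the trained model and
# rnnlm_affix names the rescored decode directories.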
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: tf-fast-lstm + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1t # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +# RNN-LM opts +rnnlm_weight=0.5 +rnnlm_dir=data/tf_fast_lstm_ex250k +rnnlm_affix=unk.fast.tfrnnlm +rnnlm_beam= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
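  # --generate-egs-scp true in the call below is needed by this recipe:
  # steps/nnet3/multilingual/combine_egs.sh (stage 14) merges the supervised
  # and unsupervised egs through their .scp indexes, so both egs dumps must be
  # written with scp indexes enabled.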
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
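  # The minibatch string handed to train.py in the next stage,
  # "150=128/300=64", uses the standard nnet3 chunk-size-dependent syntax:
  # minibatches built from 150-frame chunks contain 128 examples each, while
  # 300-frame chunks are grouped 64 at a time, keeping the per-minibatch frame
  # count (and GPU memory use) roughly constant.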
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh new file mode 100755 index 00000000000..920b4317669 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh @@ -0,0 +1,466 @@ +#!/bin/bash + +# This script is same as _t, but uses speed perturbation instead of +# silence padding for creating discrete length utterances. 
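The perturbation itself happens before this script runs (the unsupervised set
later appears with an _spEx suffix); the allowed lengths correspond to the
150/300/600/1200-frame chunk sizes in minibatch_size below. A purely
illustrative sketch of the idea, assuming an utt2dur file from
utils/data/get_utt2dur.sh; the actual recipe may compute the factors
differently:

  # emit "utt-id speed-factor" pairs that snap each utterance to the nearest
  # allowed duration (1.5/3/6/12 s at 100 frames/sec); sox-style factors >1
  # speed the utterance up, <1 slow it down
  awk 'BEGIN { n = split("1.5 3 6 12", len, " ") }
       { best = len[1];
         for (i = 2; i <= n; i++) {
           d1 = len[i] - $2; d2 = best - $2;
           if (d1 * d1 < d2 * d2) best = len[i];
         }
         printf "%s %.4f\n", $1, $2 / best
       }' data/train_unsup250k/utt2dur > speed_factors.txt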
+# unsup_frames_per_eg= +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1u # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128,64/300=64,32/600=32,16,8/1200=16,8,4" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_spEx +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
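  # With --no-chunking true in the call below, each (already length-normalized)
  # utterance is kept as a single example instead of being cut into 150-frame
  # chunks; that is why this variant's minibatch_size enumerates several chunk
  # lengths ("150=128,64/300=64,32/600=32,16,8/1200=16,8,4"). --no-chunking
  # appears to be an option of this experimental egs-extraction path rather
  # than of the standard steps/nnet3/chain/get_egs.sh.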
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --no-chunking true \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_v.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_v.sh new file mode 100644 index 
00000000000..c7998761dd7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_v.sh @@ -0,0 +1,437 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +arc_scale=1.0 # arc-scale for sausage arcs +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1v # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_awt${arc_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
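The deriv weights referenced further down (--deriv-weights-scp) are the
per-frame best-path posteriors named in the header, stored as ordinary Kaldi
float vectors and applied to the per-frame derivatives because train.py is run
with --chain.apply-deriv-weights true. A minimal inspection sketch, assuming
the standard copy-vector binary and the weights.scp produced by
best_path_weights.sh:

  weights_dir=$chaindir/best_path_${unsupervised_set}_sp${decode_affix}  # pattern used below
  copy-vector scp:$weights_dir/weights.scp ark,t:- | head -n 2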
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.0 \ + --use-mbr-decode true --arc-scale $arc_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
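  # What distinguishes this variant: the unsupervised egs above are extracted
  # with --lattice-lm-scale 0.0 plus --use-mbr-decode true --arc-scale
  # $arc_scale, i.e. (as the arc_scale comment in the header suggests) the
  # supervision is weighted by sausage/MBR arc posteriors scaled by arc_scale
  # rather than by LM-scaled lattice scores, and the egs directory name
  # accordingly carries "_awt${arc_scale}" instead of "_lmwt...".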
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_w.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_w.sh new file mode 100644 index 00000000000..079575bdcc8 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_w.sh @@ -0,0 +1,428 @@ +#!/bin/bash + +# This script is same as _n, but uses trigram for decoding. 
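# With the defaults below (decode_affix empty, graph_affix=_ex250k,
# lattice_prune_beam=4.0, lattice_lm_scale=0.5, tolerance=1) the derived names
# resolve to decode_affix=_ex250k and egs_affix=_prun4.0_lmwt0.5_tol1, so the
# unsupervised egs are written to
# egs_train_unsup250k_sp_ex250k_prun4.0_lmwt0.5_tol1 under $dir.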
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1w # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
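  # Worked example of the context values handed to get_egs.sh below, assuming
  # the usual frame_subsampling_factor=3 and illustrative model contexts of
  # model_left_context=16, model_right_context=12 from configs/vars: the perl
  # expressions compute int(16 + 3/2) = 17 and int(12 + 3/2) = 13 (division
  # binds tighter than +), so every context value, including the
  # *_initial/_final ones, gains one extra frame of slack.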
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_x.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_x.sh new file mode 100644 index 00000000000..0ff5fef3be7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_x.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# This script is same as _m, but uses 3gram LM for decoding +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion +apply_deriv_weights=true + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs 
that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1x # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + mkdir -p $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
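+
+    # Illustrative note: the unsupervised lattices come from decoding with the
+    # chain seed model, which already runs at the reduced frame rate, so
+    # --alignment-subsampling-factor is 1 below; the supervised egs above use 3
+    # because their lattices come from the GMM system at the full frame rate.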
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_y.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_y.sh new 
file mode 100644 index 00000000000..ec537d2f84b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_y.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is same as _s, but uses naive splitting +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1y # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! cuda-compiled; then + cat < $out_dir/lat.JOB.gz" || exit 1 + fi + fi + + if [ ! 
-f $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.1.gz ]; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${det_decode_affix} \ + $chaindir/decode_${dset}_sp${det_decode_affix}_fg + fi + fi + + if [ $stage -le 7 ]; then + out_dir=$chaindir/decode_${dset}_sp${decode_affix}_fg + mkdir -p $out_dir + + $decode_cmd JOB=1:$decode_nj $out_dir/log/compose_lat.JOB.log \ + lattice-interp --alpha=0 --write-compact=false \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${decode_affix}/lat.JOB.gz |" \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.JOB.gz |" \ + "ark:| gzip -c > $out_dir/lat.JOB.gz" + echo $decode_nj > $out_dir/num_jobs + + ln -sf ../final.mdl $out_dir/final.mdl || true + fi +done + +decode_affix=${decode_affix}_fg + +if [ $stage -le 8 ]; then + steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ + data/${unsupervised_set}_sp_hires data/lang_chain \ + $chaindir/decode_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri3 + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}${det_decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${det_decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${det_decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim 
target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
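+
+    # Illustrative note: the context values computed above are rounded up by
+    # half a frame-subsampling step; e.g. with model_left_context=16,
+    # extra_left_context=0 and frame_subsampling_factor=3 (assumed numbers, for
+    # illustration only), left_context = int(16 + 3/2) = 17 input frames.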
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${det_decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_z.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_z.sh new file mode 100644 index 00000000000..0d201a5ace8 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_z.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1z # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
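+
+    # Illustrative note: for these unsupervised egs the decoded lattice itself
+    # is the supervision; --lattice-lm-scale 0.5 keeps the lattice's LM (graph)
+    # scores at half weight inside that supervision, --lattice-prune-beam 4.0
+    # prunes unlikely paths first, and --deriv-weights-scp supplies per-frame
+    # best-path posteriors so less confident frames get smaller gradients.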
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh new file mode 100755 index 
00000000000..6366cfdad3a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh @@ -0,0 +1,194 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 50 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +ivector_train_set=semisup50k_250k +tree_affix= +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=8 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_best_path_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_best_path_a.sh new file mode 100755 index 00000000000..d4fa3f5d20b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_best_path_a.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=best_path_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
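+
+    # Worked example of the context padding computed above (values assumed for
+    # illustration only): if the supervised model had model_left_context=28,
+    # then with extra_left_context=0 and frame_subsampling_factor=3 the egs
+    # below would be extracted with left-context int(28 + 0 + 3 / 2) = 29
+    # frames, e.g. perl -e "print int(28 + 3 / 2)"   # -> 29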
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --keep-only-best-path true \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh new file mode 100755 index 
00000000000..b24ce252642 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh @@ -0,0 +1,208 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 50 hours. +# This is similar to _b, but uses biphone tree with upto 7000 leaves. + +# configs for 'chain' +stage=0 +tdnn_affix=7c +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsup_train_set=train_unsup100k_250k +semisup_train_set=semisup50k_100k_250k +tree_affix=bi_c +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=8 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
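+  # Concretely, with the default xent_regularize=0.1 set at the top of this
+  # script, the learning-rate factor below comes out to 0.5 / 0.1 = 5.0.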
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh new file mode 100755 index 00000000000..7c008d0d879 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh @@ -0,0 +1,208 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 50 hours. +# This is similar to _c, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsup_train_set=train_unsup100k_250k +semisup_train_set=semisup50k_100k_250k +tree_affix=bi_d +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=8 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. 
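+#
+# Any of the options above can be overridden on the command line once
+# utils/parse_options.sh (sourced below) has run, e.g. a hypothetical invocation:
+#   local/semisup/chain/tuning/run_tdnn_50k_d.sh --stage 12 --num-epochs 4 --exp exp/semisup_50k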
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
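+
+  # Rough arithmetic for the learning-rate schedule below: the 'effective'
+  # rates are scaled by the number of parallel jobs, so training starts at
+  # roughly 3 * 0.001 = 0.003 and ends at roughly 16 * 0.0001 = 0.0016.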
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_poco_unk_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh new file mode 100755 index 00000000000..207dd5b40f7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh @@ -0,0 +1,208 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 50 hours. +# This is similar to _c, but uses poco LM for decoding. + +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsup_train_set=train_unsup100k_250k +semisup_train_set=semisup50k_100k_250k +tree_affix=bi_e +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=8 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
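+  # (--context-opts "--context-width=2 --central-position=1" below requests a
+  # left-biphone phonetic context, rather than the default triphone context.)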
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
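+
+  # The comma-separated --egs.chunk-width below (160,140,110,80) asks for
+  # examples of several chunk lengths; the trainer then groups chunks of equal
+  # length into minibatches.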
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_poco_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_e.sh new file mode 100755 index 00000000000..8750c88e627 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_e.sh @@ -0,0 +1,452 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= +do_finetuning=false + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + --lm-opts "--num-extra-lm-states=2000" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
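+
+    # Unlike the 'best path' variants, the supervision here is the full pruned
+    # decoding lattice: --lattice-lm-scale 0.5 (set near the top of this script)
+    # scales the lattice LM scores used in the unsupervised supervision, and
+    # --lattice-prune-beam 4.0 prunes the lattices first to keep them small.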
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh new 
file mode 100755 index 00000000000..00f5b2556e7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is same as _g, but split lattice supervision +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
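+
+    # The supervised egs below use --alignment-subsampling-factor 3 because the
+    # numerator lattices come from the tri4a GMM system at the full frame rate,
+    # whereas the unsupervised egs further down use a factor of 1, since those
+    # lattices were produced by decoding with the chain model and are already
+    # at the reduced frame rate.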
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
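+
+    # combine_egs.sh above treats the two egs directories like two 'languages':
+    # the positional argument '2' is the number of input egs dirs, and
+    # --lang2weight ($supervision_weights, i.e. 1.0,1.0 here) attaches a
+    # per-source weight, so here neither the supervised nor the unsupervised
+    # examples are down-weighted.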
+ fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh new file mode 100755 index 00000000000..ad5d1496d66 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh @@ -0,0 +1,443 @@ +#!/bin/bash + +# This script is same as _e, but uses tree from supervised set. 
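+# Like the other scripts in this series, it assumes the supervised chain system
+# ($chaindir) and its decode of the unsupervised data already exist: frames_per_eg
+# is read from $chaindir/egs/info and the unsupervised lattices are taken from
+# $chaindir/decode_${unsupervised_set}${decode_affix}.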
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
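+    # The supervised egs below use lattices from the GMM system (tri4a), which are at
+    # the un-subsampled frame rate, hence --alignment-subsampling-factor 3 to match the
+    # chain model's output rate (the usual chain setup).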
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
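+    # (Both egs dirs were dumped with --generate-egs-scp true above, which is what
+    # allows them to be combined here via their scp lists.)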
+ fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh new file mode 100755 index 00000000000..4aa1105c7c6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh @@ -0,0 +1,453 @@ +#!/bin/bash + +# This script is same as _g, but smart splitting. 
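+# ("Smart splitting" here means the unsupervised-data egs are dumped with
+# steps/nnet3/chain/get_egs_split.sh, which splits the lattice supervision into
+# chunks, rather than the plain steps/nnet3/chain/get_egs.sh used in _g.)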
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
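+    # For the unsupervised egs below the decoded lattice itself is the supervision:
+    # roughly, --lattice-lm-scale sets how much the lattice LM scores are trusted,
+    # --lattice-prune-beam prunes the lattices first, and the per-frame deriv
+    # weights come from the best-path posteriors in weights.scp.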
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh new 
file mode 100755 index 00000000000..27fe0476727 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh @@ -0,0 +1,444 @@ +#!/bin/bash + +# This script is same as _h, but uses 3-gram LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1i # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix}_fg \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
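+    # Unlike the supervised egs (--alignment-subsampling-factor 3), the factor is 1
+    # below because these lattices come from decoding with the chain model itself,
+    # so they are already at the subsampled frame rate.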
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh new 
file mode 100755 index 00000000000..bfca03d9de9 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh @@ -0,0 +1,453 @@ +#!/bin/bash + +# This script is same as _i, but uses best path and weights from 4gram. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1j # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix}_fg \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
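+    # --left-tolerance/--right-tolerance (both $tolerance) roughly control how far
+    # phone boundaries in the lattice supervision may shift when the numerator
+    # supervision is built; 1 keeps it close to the decoded lattice timings.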
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_k.sh new 
file mode 100755 index 00000000000..1c78277cb7f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_k.sh @@ -0,0 +1,453 @@ +#!/bin/bash + +# This script is the same as _j, but uses a 4-gram LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1k # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh new 
file mode 100755 index 00000000000..f93aebb4027 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is the same as _k, but uses a biphone tree. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1l # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
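+    # "Smart" vs. "naive" splitting (cf. "Supervision: Smart split lattices" in the
+    # header): with use_smart_splitting=true the block below calls
+    # steps/nnet3/chain/get_egs_split.sh, which splits the lattice supervision into
+    # chunks while taking the lattice structure into account; otherwise the standard
+    # steps/nnet3/chain/get_egs.sh is used and the supervision is split naively at
+    # chunk boundaries.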
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh new 
file mode 100755 index 00000000000..f85a34660bd --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is the same as _l, but uses the UNK LM (the supervised lattices come from a *_unk_lats directory). +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1m # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh new 
file mode 100755 index 00000000000..1b2614c660c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is the same as _m, but does not use the UNK LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7e # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1n # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_e +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
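+    # Both the supervised and the unsupervised egs are dumped with
+    # --generate-egs-scp true so that steps/nnet3/multilingual/combine_egs.sh
+    # (stage 14 below) can merge them, weighting the two sources by
+    # $supervision_weights; the combined egs then train the per-source outputs
+    # (output-0 for the supervised egs, output-1 for the unsupervised egs)
+    # defined in the xconfig above.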
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_o.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_o.sh new 
file mode 100755 index 00000000000..82da071e0bc --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_o.sh @@ -0,0 +1,466 @@ +#!/bin/bash + +# This script is same as _k, but uses biphone tree. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1o # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
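    # The --left-context/--right-context values passed to get_egs.sh below were computed
    # near the top of this block.  To make that arithmetic concrete (hypothetical numbers,
    # for illustration only -- the real values come from $dir/configs/vars): with
    # model_left_context=16, extra_left_context=0 and frame_subsampling_factor=3,
    # left_context=16 and the perl rounding gives int(16 + 3/2) = int(17.5) = 17,
    # i.e. one extra frame of context to cover the half-frame shift from subsampling.
    #   model_left_context=16; extra_left_context=0; frame_subsampling_factor=3
    #   left_context=$[model_left_context + extra_left_context]             # 16
    #   perl -e "print int($left_context + $frame_subsampling_factor / 2)"  # prints 17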
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_p.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_p.sh new 
file mode 100755 index 00000000000..db989b65eb5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_p.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is same as _k, but uses biphone tree. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1p # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,6 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
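    # Worked example for the xent learning-rate factor set in the config section above
    # (stage 11); this is an informal gloss using the script's default xent_regularize=0.1:
    #   0.5 / 0.1 = 5.0
    # so the cross-entropy output layer gets a learning-rate factor of 5.0, which keeps
    # its effective learning rate roughly independent of the regularization constant.
    #   python -c "print(0.5 / 0.1)"   # -> 5.0
    # is equivalent to the `echo "print 0.5/$xent_regularize" | python` line above.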
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_q.sh new 
file mode 100755 index 00000000000..0dc1ef33e03 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_q.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# This script is same as _p, but does not use phone UNK model +# Also the same as _n, but uses speed-perturbed data to get +# appropriate weights for phone LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7e # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_e +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
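    # The --deriv-weights-scp option passed to the egs script below points at
    # per-utterance vectors of frame weights (posteriors of the best-path pdfs), which
    # scale the chain derivatives for the unsupervised data.  To eyeball one entry,
    # something like the following should work -- an illustrative sketch (kept commented
    # out so it does not run as part of the recipe), assuming Kaldi's copy-vector binary
    # is on the path and that weights.scp indexes plain float vectors:
    #   head -n 1 $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp | \
    #     copy-vector scp:- ark,t:- | head -c 300; echo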
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_e.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..70dc30bd331 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_e.sh @@ -0,0 +1,529 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
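    # For readers unfamiliar with the dropout schedule passed to train.py below
    # (dropout_schedule, defined near the top of this script), the following is an
    # informal reading of '0,0@0.20,0.3@0.50,0' rather than text from the script:
    # each entry is a dropout proportion, optionally tagged with the fraction of
    # training data seen, interpreted roughly as
    #   0.00 of training seen -> dropout 0.0
    #   0.20 of training seen -> dropout 0.0
    #   0.50 of training seen -> dropout 0.3
    #   1.00 of training seen -> dropout 0.0
    # with linear interpolation between consecutive points.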
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_f.sh new file mode 100644 index 00000000000..91faa23cc6e --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_f.sh @@ -0,0 +1,525 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
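+
+        # use_smart_splitting (true by default in this script) selects
+        # steps/nnet3/chain/get_egs_split.sh below instead of the standard
+        # get_egs.sh for cutting the unsupervised lattice supervision into
+        # fixed-length chunks; the comb/egs affix set near the top records the
+        # choice as "_smart" vs "_naive".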
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient 0.00001 \ + --chain.mmi-factor-schedule="output-0=1.0,1.0 output-1=1.0,1.0" \ + --chain.smbr-factor-schedule="output-0=0.0,0.0 output-1=0.0,0.0" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + 
# the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + 
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_a.sh new file mode 100644 index 00000000000..6f0a5f932fb --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_a.sh @@ -0,0 +1,523 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1e_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +train_extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.2,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.2,0.2@0.2,0.2" +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
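+  # The combine_egs.sh call above pools the supervised and unsupervised egs as if
+  # they were two "languages": the trailing 2 is the number of input egs
+  # directories, and --lang2weight $supervision_weights (1.0,1.0 in this script)
+  # scales the supervision from each source accordingly.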
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + $train_extra_opts --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_b.sh new file mode 100644 index 00000000000..37362657651 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_b.sh @@ -0,0 +1,529 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup 
+unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
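+
+        # For the unsupervised egs below, --left-tolerance / --right-tolerance
+        # (1 frame here) bound how far phone boundaries in the lattice supervision
+        # may shift, and --deriv-weights-scp supplies per-frame derivative weights
+        # taken from the best-path pdf posteriors ("Deriv weights: Lattice
+        # posterior of best path pdf" in the header of this script).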
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # 
the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + 
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..ebf52fa8b40 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_a.sh @@ -0,0 +1,513 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
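+    # Informal notes on the unsupervised egs dumped by the next block (a sketch of
+    # the intent, not part of the original recipe comments):
+    #  - --lattice-lm-scale 0.5 keeps the (scaled) lattice graph costs as supervision
+    #    weights, so competing paths in the decode lattice share the credit;
+    #  - --deriv-weights-scp points at per-frame posteriors of the best-path pdf,
+    #    down-weighting frames the decoder was unsure about.
+    # To eyeball those weights for a few utterances one could run, e.g.:
+    #   copy-vector scp:$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp ark,t:- | head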
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..f41374e4593 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh @@ -0,0 +1,517 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=768 +cell_dim=768 +projection_dim=192 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
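+    # The next block picks the egs-dumping script: get_egs_split.sh ("smart
+    # splitting") tries to cut the lattice supervision at relatively unambiguous
+    # points, while get_egs.sh makes fixed-length cuts -- a rough description of
+    # the intent; see the scripts themselves for details.  unsup_frames_per_eg
+    # defaults to the supervised model's frames_per_eg (150 here), so chunk lengths
+    # match across the two egs sources when they are combined below.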
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..f5f41fd67c1 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_d.sh @@ -0,0 +1,517 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=768 +cell_dim=768 +projection_dim=192 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
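+    # Reminder of how the two weight options differ (informal summary):
+    #  - lm_weights=3,2 was used above by make_weighted_den_fst.sh: supervised
+    #    alignments count 3 times and unsupervised best-path alignments 2 times
+    #    when estimating the denominator phone LM;
+    #  - supervision_weights=1.0,1.0 goes to combine_egs.sh below as --lang2weight
+    #    and scales the examples from each source (i.e. their derivatives) in training.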
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..61c55686efe --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_e.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
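+  # Note on the weighting above: supervision_weights=1.0,1.0 gives the supervised
+  # and unsupervised egs equal weight in the combined objective, and leaving
+  # num_copies empty means neither set is duplicated.  To down-weight the
+  # unsupervised data one could instead pass, for example,
+  #   --lang2weight 1.0,0.5
+  # (illustrative value only; this recipe keeps both weights at 1.0).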
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh new file mode 100644 index 00000000000..dfdc36d6428 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k 
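+# Note: this variant (conf_f) differs from conf_e mainly in the seed model and
+# tree it builds on (tdnn_affix=7g, tree_affix=bi_e), in extracting egs with
+# "--constrained false", and in combining egs with
+# steps/nnet3/chain/multilingual/combine_egs.sh (block-size based) rather than
+# steps/nnet3/multilingual/combine_egs.sh; see the get_egs and combine_egs calls
+# below.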
+semisup_train_set= # semisup100k_250k + +tdnn_affix=7g # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_e + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true --constrained false \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
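+    # The options passed to the egs script below turn the decode lattices into
+    # supervision for the unsupervised data: --lattice-lm-scale 0.5 retains the
+    # lattice LM scores (scaled by 0.5) in the supervision weights,
+    # --lattice-prune-beam 4.0 prunes the lattices first, and --deriv-weights-scp
+    # supplies per-frame derivative weights taken from the best-path pdf
+    # posteriors (see the script header).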
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true --constrained false $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh new file mode 100644 index 00000000000..9dcfb693eda --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7f # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_f + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + $treedir ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true --constrained false \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true --constrained false $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
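+  # The combined egs carry two output names: output-0 (supervised set) and
+  # output-1 (unsupervised set), matching the extra output nodes declared in the
+  # xconfig above.  At decode time (stage 18) output-0 is renamed back to
+  # "output" with nnet3-copy --edits before nnet3-am-copy wraps it into an
+  # acoustic model.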
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_a.sh new file mode 100644 index 00000000000..23aa3531377 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_a.sh @@ -0,0 +1,512 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup 
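+# Note: unlike the conf_* variants above, this script also trains the
+# unsupervised output with the sMBR objective in the main training stage; see
+# the --chain.mmi-factor-schedule / --chain.smbr-factor-schedule options and
+# chain_smbr_extra_opts="--one-silence-class" passed to train.py below.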
+unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
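+    # Worked example for the context arithmetic above (model_left_context is a
+    # hypothetical value; the real one is read from $dir/configs/vars): with
+    # model_left_context=28, chunk_left_context=40 and frame_subsampling_factor=3,
+    #   left_context     = 28 + 40 = 68
+    #   egs_left_context = int(68 + 3/2) = 69
+    # i.e. the egs get an extra frame_subsampling_factor/2 frames of context.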
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient 0.00001 \ + --chain.mmi-factor-schedule="output-0=1.0,1.0 output-1=0.5,0.5" \ + --chain.smbr-factor-schedule="output-0=0.0,0.0 output-1=0.2,0.2" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + 
# the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + 
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh new file mode 100644 index 00000000000..ce8bf87c87d --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh @@ -0,0 +1,437 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_250k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
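A note on the context arithmetic above: the egs are dumped with the model context plus the chunk context, padded by about half a frame-subsampling period (presumably to cover the frame shifts applied to egs during training). A minimal worked sketch with hypothetical values; the real model_left_context comes from $dir/configs/vars:

  # hypothetical numbers, for illustration only
  model_left_context=40; chunk_left_context=40; frame_subsampling_factor=3
  left_context=$[model_left_context + chunk_left_context]        # 80
  egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"`
  echo $egs_left_context                                          # 81: one extra frame of padding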
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh new file mode 100644 index 00000000000..ec5d2138730 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh @@ -0,0 +1,531 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_500k_ex500k_1a_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true 
+use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
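To make the comment in the xconfig block above concrete: with this script's default xent_regularize=0.025 the factor comes out to 20, i.e. the xent final layer gets a 20x larger learning rate to offset its objective being scaled down by the regularization constant. The same python2-style one-liner used in stage 11, with the value plugged in:

  xent_regularize=0.025
  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
  echo $learning_rate_factor   # 20.0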
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..72b3ef0cb25 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_a.sh @@ -0,0 +1,518 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k 
+semisup_train_set= # semisup100k_500k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +num_copies= +lm_weights=3,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=dnn2 dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn4 dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn6 dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false learning-rate-factor=$learning_rate_factor max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
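The output-0/output-1 (and output-0-xent/output-1-xent) nodes above are plain descriptors over the shared output.affine and output-xent.log-softmax components, so the two egs sources later passed to combine_egs.sh (supervised first, unsupervised second) update the same final-layer parameters; skip-in-init=true presumably just keeps these training-only nodes out of the generated init config, where output.affine does not yet exist. An informal check after stage 11, assuming xconfig_to_configs.py writes its usual final.config (this is not part of the recipe):

  grep 'name=output-[01]' $dir/configs/final.config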
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight "$supervision_weights" \ + --lang2num-copies "$num_copies" --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..905309bc5c7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_b.sh @@ -0,0 +1,517 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +num_copies= +lm_weights=3,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false learning-rate-factor=$learning_rate_factor max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # 
trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
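+  # The two egs dirs are combined as if they were two "languages": supervised
+  # examples are trained against output-0 and unsupervised examples against
+  # output-1 (see the output nodes declared in the xconfig above).
+  # --lang2weight scales the supervision weight of each source, and
+  # --lang2num-copies (if set, e.g. "2,1") would duplicate one source's egs
+  # relative to the other.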
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..1e8ce3039a6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_c.sh @@ -0,0 +1,513 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k 
+semisup_train_set= # semisup100k_500k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false learning-rate-factor=$learning_rate_factor max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
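+  # The output-0/output-1 (and -xent) nodes declared below reuse the affine and
+  # log-softmax components of the main outputs; they exist only so that the
+  # supervised and unsupervised portions of the combined egs dir can be trained
+  # against separate output nodes.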
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
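+    # The unsupervised data is supervised by lattices rather than alignments:
+    # --lattice-lm-scale keeps a scaled-down LM score in the lattice weights,
+    # --lattice-prune-beam prunes the lattices beforehand, and
+    # --deriv-weights-scp applies per-frame derivative weights taken from the
+    # best-path posteriors (see the "Deriv weights" note in the header of this
+    # script).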
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..910cac4f1d0 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
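+  # Note the different combine script used here:
+  # steps/nnet3/chain/multilingual/combine_egs.sh takes the number of egs dirs
+  # ("2") as a positional argument and uses --block-size rather than
+  # --minibatch-size.  With num_copies=2,1 each supervised eg is included twice
+  # for every unsupervised eg, shifting the per-iteration data mix toward the
+  # supervised set.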
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts=${decode_iter:+--iter $decode_iter} + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_kl_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_kl_a.sh new file mode 100644 index 00000000000..e2a94495332 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_kl_a.sh @@ -0,0 +1,442 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k 
+semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.5,0.5" +kl_factor_schedule="output-0=0.0,0.0 output-1=0.5,0.5" + +# Semi-supervised options +comb_affix=comb_500k_1a_kl0.5 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
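+    # Egs generation here is identical to the plain semi-supervised recipes;
+    # what changes in this script is the train.py call below, where
+    # --chain.mmi-factor-schedule and --chain.kl-factor-schedule split the
+    # objective per output (with the schedules set at the top, output-1, i.e.
+    # the unsupervised data, is trained with 0.5 MMI + 0.5 KL).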
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.kl-factor-schedule="$kl_factor_schedule" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh new file mode 100644 index 00000000000..281a6e4d88d --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh @@ -0,0 +1,518 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_500k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0
+dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 
input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
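The create_split_dir.pl call above is specific to the CLSP grid; on another cluster the same layout can be reproduced by listing whatever local scratch disks are available. A hedged sketch (the /scratch paths are placeholders, not part of the recipe):
  # Spread the egs archives across several disks and mark the directory so cleanup stages leave it alone.
  utils/create_split_dir.pl /scratch{1,2,3}/$USER/kaldi-data/egs/fisher-semisup/$sup_egs_dir/storage \
    $sup_egs_dir/storage
  touch $sup_egs_dir/.nodelete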
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh new file mode 100644 index 00000000000..f29b65c6e7b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh @@ -0,0 +1,446 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup
+unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_500k_1c_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist."
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
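In the unsupervised get_egs call just below, --lattice-lm-scale 0.5 keeps the graph (LM) scores of the decoded lattices at half weight in the supervision, --lattice-prune-beam 4.0 prunes those lattices first, and --deriv-weights-scp downweights each frame by the best-path pdf posterior. A quick way to eyeball those per-frame weights, assuming the best_path directory already exists and stores them as Kaldi float vectors (illustrative, not part of the recipe):
  # Print the first couple of per-frame derivative weight vectors; values should lie in [0,1].
  copy-vector scp:$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp ark,t:- 2>/dev/null | head -n 2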
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_b.sh new file mode 100755 index 00000000000..e686d977ded --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_b.sh @@ -0,0 +1,225 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
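One portability note on the learning_rate_factor line in stage 12 above: piping "print 0.5/$xent_regularize" into python relies on the Python 2 print statement. A version-agnostic sketch of the same computation (perl here is my substitution, matching the style used for the egs contexts in the semi-supervised scripts):
  # 0.5 / 0.025 = 20, so the xent output layer trains 20x faster than the chain output
  # (perl prints 20, python prints 20.0; either works as a config value).
  learning_rate_factor=$(perl -e "print 0.5 / $xent_regularize")
  # or, Python 2/3 compatible: learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")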
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh new file mode 100755 index 00000000000..1854a4a86e1 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh @@ -0,0 +1,225 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7c +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. 
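Because every variable above is declared before ./utils/parse_options.sh is sourced a few lines below, each one can be overridden from the command line (flag --foo-bar maps to variable foo_bar). A typical invocation, purely illustrative:
  # Re-run this recipe from the network-config stage onwards with fewer training epochs.
  local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh --stage 12 --train-stage -10 --num-epochs 2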
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_d.sh new file mode 100755 index 00000000000..265a8c05a11 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_d.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
+ +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_e.sh new file mode 100755 index 00000000000..05fe3a017e3 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_e.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_f.sh new file mode 100755 index 00000000000..9bc98d90934 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_f.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
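A note on the decode stages in these recipes: --nj is set to the number of distinct speakers in the test set by counting unique entries in the second column of utt2spk. For a data directory that has been through utils/fix_data_dir.sh, spk2utt carries one line per speaker, so the same count can be read more directly; a minimal sketch, assuming spk2utt is present and consistent with utt2spk (the data directory name is just an example):

  decode_set=dev
  # same value as: cat data/${decode_set}_hires/utt2spk | cut -d' ' -f2 | sort -u | wc -l
  num_jobs=$(wc -l < data/${decode_set}_hires/spk2utt)
  echo "decoding ${decode_set} with $num_jobs jobs (one per speaker)"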
+ +# configs for 'chain' +stage=0 +tdnn_affix=7f +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --constrained false" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_g.sh new file mode 100755 index 00000000000..ff4e8d55efc --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_g.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7g +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --constrained false" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_b.sh new file mode 100755 index 00000000000..988299a4621 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_b.sh @@ -0,0 +1,244 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7b_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
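The --trainer.dropout-schedule value passed below, '0,0@0.20,0.3@0.50,0', is read here as (training-fraction, dropout-proportion) breakpoints with linear interpolation between them: dropout stays at 0 for the first 20% of training, ramps up to 0.3 by the halfway point, and decays back to 0 by the end. A hedged sketch of that interpolation, assuming this reading of the value@fraction format:

  # evaluate the assumed schedule 0.00->0, 0.20->0, 0.50->0.3, 1.00->0 at a given fraction
  frac=0.35
  awk -v f=$frac 'BEGIN{
    x[1]=0.00; y[1]=0.0;
    x[2]=0.20; y[2]=0.0;
    x[3]=0.50; y[3]=0.3;
    x[4]=1.00; y[4]=0.0;
    for (i = 1; i < 4; i++)
      if (f >= x[i] && f <= x[i+1]) {
        print y[i] + (y[i+1] - y[i]) * (f - x[i]) / (x[i+1] - x[i]);
        exit;
      }
  }'
  # prints 0.15 for frac=0.35 under this reading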
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_c.sh new file mode 100755 index 00000000000..b21dd72a37a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_c.sh @@ -0,0 +1,247 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7c_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=dnn2 dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn4 dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn6 dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
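The learning_rate_factor computed in the xconfig stage above is 0.5 / xent_regularize, i.e. 20 for xent_regularize=0.025, which is what makes the xent output layer learn at a rate independent of the regularization constant. As written, the value is obtained by piping a print statement through python, which assumes Python 2; a hedged, equivalent computation with awk (purely illustrative, not part of the recipe):

  xent_regularize=0.025
  learning_rate_factor=$(awk -v x=$xent_regularize 'BEGIN{print 0.5 / x}')
  echo $learning_rate_factor   # -> 20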
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_d.sh new file mode 100755 index 00000000000..876633fedd6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_d.sh @@ -0,0 +1,247 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7d_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh new file mode 100755 index 00000000000..1806303f319 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh @@ -0,0 +1,233 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
+ +# configs for 'chain' +stage=0 +tdnn_affix=7smbr_a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 +extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5" +chain_smbr_extra_opts= +smbr_leaky_hmm_coefficient=0.00001 +leaky_hmm_coefficient=0.1 +l2_regularize=0.0 # 00005 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient $leaky_hmm_coefficient \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --cleanup.preserve-model-interval 10 \ + --dir $dir --lang $lang $extra_opts || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh.orig b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh.orig new file mode 100755 index 00000000000..a7505376a19 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh.orig @@ -0,0 +1,240 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7smbr_a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 +extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5" +chain_smbr_extra_opts= +smbr_leaky_hmm_coefficient=0.00001 +leaky_hmm_coefficient=0.1 +<<<<<<< Updated upstream +l2_regularize=0.0 # 00005 +======= +>>>>>>> Stashed changes + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
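The extra_opts defined near the top of this script pair an MMI factor schedule of 1.0,1.0@0.1,0.5@0.2,0.5 with an sMBR factor schedule of 0.0,0.0@0.1,0.5@0.2,0.5. Assuming these schedules use the same value@fraction, piecewise-linear format as the dropout schedule, the objective stays pure LF-MMI for the first 10% of training and then cross-fades so that MMI and sMBR each carry weight 0.5 from the 20% point onward. A small sketch of the weights this reading implies at a few training fractions:

  for f in 0.0 0.1 0.15 0.2 1.0; do
    awk -v f=$f 'BEGIN{
      # assumed breakpoints: mmi 1.0@0, 1.0@0.1, 0.5@0.2, 0.5@1; smbr 0.0@0, 0.0@0.1, 0.5@0.2, 0.5@1
      if (f <= 0.1)      { mmi = 1.0;                         smbr = 0.0; }
      else if (f <= 0.2) { mmi = 1.0 - 0.5 * (f - 0.1) / 0.1; smbr = 0.5 * (f - 0.1) / 0.1; }
      else               { mmi = 0.5;                         smbr = 0.5; }
      printf("fraction %.2f: mmi-factor %.2f  smbr-factor %.2f\n", f, mmi, smbr);
    }'
  done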
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient $leaky_hmm_coefficient \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ +<<<<<<< Updated upstream + --chain.l2-regularize $l2_regularize \ +======= + --chain.l2-regularize 0.0 \ +>>>>>>> Stashed changes + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --cleanup.preserve-model-interval 10 \ + --dir $dir --lang $lang $extra_opts || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_a.sh new file mode 100755 index 00000000000..bf1e4878c8e --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_a.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
+# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup15k_250k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_b.sh new file mode 100755 index 00000000000..3c9ab27a353 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_b.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup15k_250k +tree_affix=bi_j +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
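+  # Editorial note (illustrative, not part of the original recipe): the xent
+  # output's learning-rate-factor computed in stage 12 above is
+  # 0.5 / xent_regularize; with the default xent_regularize=0.025 set at the
+  # top of this script it works out to
+  #   echo "print 0.5/0.025" | python    # -> 20.0
+  # i.e. the factor cancels the small 0.025 weight on the xent objective so
+  # that the xent final layer trains at a rate independent of the
+  # regularization constant, as the comment in the xconfig block explains.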
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_oracle_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_oracle_a.sh new file mode 100755 index 00000000000..997a17a5329 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_oracle_a.sh @@ -0,0 +1,244 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7a_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
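+  # Optional pre-flight check (an illustrative sketch, not part of the
+  # original recipe): fail early if the inputs that train.py reads below are
+  # missing, assuming the standard Kaldi layout of lattice and online-ivector
+  # directories (lat.1.gz, ivector_online.scp).
+  for f in $treedir/tree $lat_dir/lat.1.gz \
+           $train_ivector_dir/ivector_online.scp; do
+    [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
+  done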
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..145b4c0e178 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_a.sh @@ -0,0 +1,507 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_15k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semisup_train_set=semisup15k_250k + +tdnn_affix=7a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_i + +nnet3_affix=_semi15k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi15k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
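+    # Note added for clarity (not part of the original recipe): the
+    # unsupervised egs generated below carry two kinds of uncertainty
+    # information from the seed model's decode of the unsupervised data: the
+    # lattice itself, whose graph/LM scores are kept scaled by
+    # --lattice-lm-scale (0.5 here), and per-frame derivative weights taken
+    # from the best-path pdf posteriors (weights.scp), which training then
+    # honours because apply_deriv_weights=true above.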
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..681a46212c9 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# This script uses phone LM to model UNK. + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_15k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semisup_train_set=semisup15k_250k + +tdnn_affix=7a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_i + +nnet3_affix=_semi15k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi15k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
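+    # Worked example added for clarity (hypothetical numbers, not from this
+    # run): egs_left_context computed above is
+    # int(model_left_context + chunk_left_context + frame_subsampling_factor/2);
+    # if the compiled model reported model_left_context=40, then with
+    # chunk_left_context=40 and frame_subsampling_factor=3 each example would
+    # be cut with int(40 + 40 + 1.5) = 81 frames of left context (and
+    # analogously on the right), which is the value the --left-context option
+    # below receives.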
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..01c0191be83 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_c.sh @@ -0,0 +1,507 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_15k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semisup_train_set=semisup15k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_j + +nnet3_affix=_semi15k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi15k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
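+    # Note added for clarity (not part of the original recipe): once the
+    # supervised and unsupervised egs exist, stage 14 below mixes them with
+    # combine_egs.sh using --lang2weight $supervision_weights (1.0,1.0 by
+    # default), i.e. both data sources contribute to the objective with equal
+    # weight; down-weighting the unsupervised copy (e.g. 1.0,0.5) would be the
+    # knob to adjust if its automatic transcripts proved too noisy.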
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_smbr_a.sh new file mode 100755 index 00000000000..aff735560e0 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_smbr_a.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7smbr_a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup15k_250k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 +extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5" +chain_smbr_extra_opts= + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
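For a concrete sense of the learning-rate-factor computed in stage 12 above, a minimal sketch (illustrative only; it assumes, as the script itself does, that "python" understands the Python 2 style print statement):

  # with xent_regularize=0.025 as set near the top of this script:
  echo "print 0.5/0.025" | python   # -> 20.0
  # i.e. the output-xent layer gets learning-rate-factor 20, so (per the comment
  # above) its final layer learns at a rate independent of the regularization constant.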
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --dir $dir --lang $lang $extra_opts || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_a.sh new file mode 100755 index 00000000000..6bafc30f3aa --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_a.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
+# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup50k_250k +tree_affix=bi_i +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_b.sh new file mode 100755 index 00000000000..aa0387cc1d4 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_b.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup50k_250k +tree_affix=bi_j +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
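The storage setup above is specific to the CLSP grid; on another cluster the same trick can be reused by pointing create_split_dir.pl at whatever scratch disks are available. A hedged sketch with hypothetical paths:

  # spread the egs over three hypothetical scratch disks via symlinks
  utils/create_split_dir.pl /mnt/scratch{1,2,3}/$USER/kaldi-egs/$dir/egs/storage $dir/egs/storage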
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..bab9e69bbf3 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_a.sh @@ -0,0 +1,511 @@ +#!/bin/bash + +# This script uses phone LM to model UNK. 
+ +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_50k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semisup_train_set=semisup50k_250k + +tdnn_affix=7a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_i + +gmm=tri4a + +nnet3_affix=_semi50k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi50k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/${gmm}_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
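Before building egs from the unsupervised lattices, it can be useful to eyeball the per-frame derivative weights that the get_egs call below consumes; a minimal sketch, assuming (as is usual in Kaldi) that weights.scp indexes ordinary float-vector archives with one weight per frame:

  copy-vector "scp:head -n 1 $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp |" ark,t:- | head -c 300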
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..ebd6c090267 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_b.sh @@ -0,0 +1,511 @@ +#!/bin/bash + +# This script uses phone LM to model UNK. + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_50k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semisup_train_set=semisup50k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_j + +gmm=tri4a + +nnet3_affix=_semi50k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi50k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/${gmm}_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
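A note on the naming convention used throughout this script: comb_affix above and the decode directories further down are assembled with bash's ${var:+...} expansion, which inserts the alternate text only when the variable is set and non-empty. A tiny stand-alone illustration:

  decode_iter=500
  echo "decode_dev${decode_iter:+_iter$decode_iter}"   # -> decode_dev_iter500
  decode_iter=
  echo "decode_dev${decode_iter:+_iter$decode_iter}"   # -> decode_dev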
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh new file mode 100755 index 00000000000..7b6f1716247 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh @@ -0,0 +1,199 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup11k +ivector_train_set=semisup11k_250k +tree_affix= +nnet3_affix=_semi11k_250k +chain_affix=_semi11k_250k +exp=exp/semisup_11k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +ali_dir=${gmm_dir}_ali_${train_set} +if [ $stage -le 11 ]; then + steps/align_fmllr.sh --cmd "$train_cmd" --nj 40 \ + data/${train_set} data/lang $gmm_dir $ali_dir || exit 1 + + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 data/${train_set} $lang $ali_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..718d1aaed04 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=-1 +speed_perturb=true +train_set=train + +unsup_train_set= +semisup_train_set= + +nnet3_affix= +exp=exp + +. ./path.sh +. ./utils/parse_options.sh + +if [ ! 
-z "$unsup_train_set" ] && [ -z "$semisup_train_set" ]; then + echo "$0: --semisup-train-set must be provided if --unsup-train-set is provided" + exit 1 +fi + +if [ -z "$unsup_train_set" ] && [ ! -z "$semisup_train_set" ]; then + echo "$0: --unsup-train-set must be provided if --semisup-train-set is provided" + exit 1 +fi + +if [ ! -z "$unsup_train_set" ]; then + if [ $stage -le 0 ]; then + utils/combine_data.sh data/$semisup_train_set \ + data/$train_set data/$unsup_train_set + fi +fi + +# perturbed data preparation +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have + # to perturb the normal data to get the alignments. + # _sp stands for speed-perturbed + + for datadir in ${train_set} ${unsup_train_set}; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/fix_data_dir.sh data/${datadir}_sp + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_sp + done + fi +fi + +if [ ! -z "$unsup_train_set" ]; then + if [ $stage -le 2 ]; then + utils/combine_data.sh data/${semisup_train_set}_sp \ + data/${train_set}_sp data/${unsup_train_set}_sp + fi +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $train_set $unsup_train_set; do + utils/copy_data_dir.sh data/${dataset}_sp data/${dataset}_sp_hires + utils/data/perturb_data_dir_volume.sh data/${dataset}_sp_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_sp_hires exp/make_hires/${dataset}_sp $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_sp_hires exp/make_hires/${dataset}_sp $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_sp_hires; + done + + for dataset in test dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done +fi + +ivector_train_set=${train_set}_sp +if [ ! 
-z "$unsup_train_set" ]; then + if [ $stage -le 3 ]; then + utils/combine_data.sh data/${semisup_train_set}_sp_hires \ + data/${train_set}_sp_hires data/${unsup_train_set}_sp_hires + fi + ivector_train_set=${semisup_train_set}_sp +fi + +# ivector extractor training +if [ $stage -le 4 ]; then + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 10000 --subsample 2 \ + data/${ivector_train_set}_hires \ + $exp/nnet3${nnet3_affix}/pca_transform +fi + +if [ $stage -le 5 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${ivector_train_set}_hires 512 \ + $exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${ivector_train_set}_hires $exp/nnet3${nnet3_affix}/diag_ubm $exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + # We extract iVectors on all the ${train_set} data, which will be what we + # train the system on. + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${ivector_train_set}_hires data/${ivector_train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${ivector_train_set}_max2_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; +fi + +if [ $stage -le 8 ]; then + for dataset in test dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + done +fi + +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_10k.sh b/egs/fisher_english/s5/local/semisup/run_10k.sh new file mode 100644 index 00000000000..b91c67cb711 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_10k.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_11k + +for f in data/train_sup/utt2spk data/train_unsup250k/utt2spk ]; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +utils/subset_data_dir.sh --speakers data/train_sup 11000 data/train_sup11k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup11k 5000 data/train_sup11k_short || exit 1 +utils/subset_data_dir.sh data/train_sup11k 5500 data/train_sup11k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup11k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup11k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup11k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup11k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup11k_250k data/train_sup11k data/train_unsup250k || exit 1 + +mkdir -p data/local/pocolm_ex250k + +utils/filter_scp.pl --exclude data/train_unsup250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + +local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + +local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + +utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --ivector-train-set semisup11k_250k --train-set train_sup11k --stage $stage --train-stage $train_stage || exit 1 diff --git a/egs/fisher_english/s5/local/semisup/run_15k.sh b/egs/fisher_english/s5/local/semisup/run_15k.sh new file mode 100644 index 00000000000..f64ea6221c0 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_15k.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_15k + +for f in data/train_sup/utt2spk data/train_unsup250k/utt2spk; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +utils/subset_data_dir.sh --speakers data/train_sup 15000 data/train_sup15k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup15k 5000 data/train_sup15k_short || exit 1 +utils/subset_data_dir.sh data/train_sup15k 7500 data/train_sup15k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup15k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup15k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup15k_250k data/train_sup15k data/train_unsup250k || exit 1 + +mkdir -p data/local/pocolm_ex250k + +utils/filter_scp.pl --exclude data/train_unsup250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + +if [ ! -f data/lang_test_poco_ex250k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big +fi + +local/run_unk_model.sh --lang-dirs "data/lang_test_poco_ex250k_big data/lang_test_poco_ex250k" || exit 1 + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --train-set train_sup15k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 + +local/semisup/chain/tuning/run_tdnn_oracle.sh \ + --train-set semisup15k_250k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k_oracle \ + --gmm tri3 \ + --stage 9 --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 diff --git a/egs/fisher_english/s5/local/semisup/run_20k.sh b/egs/fisher_english/s5/local/semisup/run_20k.sh new file mode 100644 index 00000000000..9af463cd7a2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_20k.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. 
utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_20k + +utils/subset_data_dir.sh --speakers data/train_sup 20000 data/train_sup20k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup20k 5000 data/train_sup20k_short || exit 1 +utils/subset_data_dir.sh data/train_sup20k 10000 data/train_sup20k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup20k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup20k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup20k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup20k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup20k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup20k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup20k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup20k_250k data/train_sup20k data/train_unsup250k || exit 1 + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --train-set train_sup20k \ + --nnet3-affix _semi20k_250k \ + --chain-affix _semi20k_250k \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup20k_250k || exit 1 diff --git a/egs/fisher_english/s5/local/tfrnnlm/rnnlm_data_prep.sh b/egs/fisher_english/s5/local/tfrnnlm/rnnlm_data_prep.sh new file mode 100755 index 00000000000..8d2a36a94c8 --- /dev/null +++ b/egs/fisher_english/s5/local/tfrnnlm/rnnlm_data_prep.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# This script prepares the data directory used for TensorFlow-based RNNLM training +# it prepares the following files in the output-directory +# 1. $dir/wordlist.rnn.final : wordlist for RNNLM +# format of this file is like the following: +# 0 The +# 1 a +# 2 is +# .... +# note that we don't reserve the 0 id for any special symbol +# 2. $dir/{train/valid} : the text files, with one sentence per line + +# 3. $dir/unk.probs : this file provides information for distributing OOS probs +# among all the OOS words, in rnnlm-rescoring. If provided, the +# OOS probability would be proportionally distributed among all OOS words +# +# It is called unk.probs to be consistent with the rnnlm-rescoring scripts for +# Mikolov's and Yandex's toolkits, but you could simply provide the count instead, as +# the binary would auto-normalize the counts into probabilities +# the format of this file is like the following: +# some-rare-word-1 0.0003 +# some-rare-word-2 0.0004 +# ... + +set -e + +train_text=data/train/text +nwords=9999 +heldout_sent=10000 + +. path.sh +. cmd.sh + +. 
utils/parse_options.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "For details of what the script does, see top of script file" + exit 1; +fi + +dir=$1 +srcdir=data/local/dict + +mkdir -p $dir + +cat $srcdir/lexicon.txt | awk '{print $1}' | sort -u | grep -v -w '!SIL' > $dir/wordlist.all + +# Get training data with OOV words (w.r.t. our current vocab) replaced with , +# as well as adding symbols at the end of each sentence +cat $train_text | awk -v w=$dir/wordlist.all \ + 'BEGIN{while((getline<w)>0) v[$1]=1;} + {for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}' | sed 's=$= =g' \ + | utils/shuffle_list.pl | gzip -c > $dir/all.gz + +echo "Splitting data into train and validation sets." + +gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data +gunzip -c $dir/all.gz | tail -n +$heldout_sent > $dir/train.in # training data + + +cat $dir/train.in $dir/wordlist.all | \ + awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ + sort -nr > $dir/unigram.counts + +total_nwords=`wc -l <$dir/unigram.counts` + +# the wordlist.rnn file is just a wordlist - i.e. with a word on each line +# wordlist.rnn.id has [word-id] [word] on each line, with [word-id] being consecutive integers +# this will not be the final wordlist we use because we need to add symbol +head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id +tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts + +for type in train valid; do + # replacing every word that does not appear in the wordlist.rnn file with a symbol + cat $dir/$type.in | awk -v w=$dir/wordlist.rnn 'BEGIN{while((getline<w)>0)d[$1]=1}{for(i=1;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' > $dir/$type +done + +cat $dir/unk_class.counts | awk '{print $2, $1}' > $dir/unk.probs +cp $dir/wordlist.rnn $dir/wordlist.rnn.final + +if ! grep -w '' $dir/wordlist.rnn.final >/dev/null; then + echo "" >> $dir/wordlist.rnn.final +fi + +echo "data preparation finished" diff --git a/egs/fisher_english/s5/local/tfrnnlm/run_lstm_fast.sh b/egs/fisher_english/s5/local/tfrnnlm/run_lstm_fast.sh new file mode 100755 index 00000000000..6328bfd11dc --- /dev/null +++ b/egs/fisher_english/s5/local/tfrnnlm/run_lstm_fast.sh @@ -0,0 +1,58 @@ +#!/bin/bash +ngram_order=3 # this option when used, the rescoring binary makes an approximation + # to merge the states of the FST generated from RNNLM. e.g. if ngram-order = 4 + # then any history that shares last 3 words would be merged into one state +stage=1 +weight=0.5 # when we do lattice-rescoring, instead of replacing the lm-weights + # in the lattice with RNNLM weights, we usually do a linear combination of + # the 2 and the $weight variable indicates the weight for the RNNLM scores + +train_text=data/train/text +nwords=9999 +opts= +dir=data/tensorflow_fast_lstm + +. ./utils/parse_options.sh +. ./cmd.sh +. ./path.sh + +set -e + +mkdir -p $dir + +#steps/tfrnnlm/check_tensorflow_installed.sh + +if [ $stage -le 1 ]; then + local/tfrnnlm/rnnlm_data_prep.sh --train-text $train_text --nwords $nwords $dir +fi + +mkdir -p $dir +if [ $stage -le 2 ]; then +# the following script uses TensorFlow. 
You could use tools/extras/install_tensorflow_py.sh to install it + $train_cmd --gpu 1 --mem 20G $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ + python steps/tfrnnlm/lstm_fast.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final ${opts} +fi + +exit 0 + +final_lm=ami_fsh.o3g.kn +LM=$final_lm.pr1-7 + +if [ $stage -le 3 ]; then +# for decode_set in dev; do + for decode_set in dev eval; do + basedir=exp/$mic/nnet3/tdnn_sp/ + decode_dir=${basedir}/decode_${decode_set} + + # Lattice rescoring + steps/lmrescore_rnnlm_lat.sh \ + --cmd "$tfrnnlm_cmd --mem 16G" \ + --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ + data/lang_$LM $dir \ + data/$mic/${decode_set}_hires ${decode_dir} \ + ${decode_dir}.unk.fast.tfrnnlm.lat.${ngram_order}gram.$weight & + + done +fi + +wait diff --git a/egs/fisher_english/s5/local/tfrnnlm/run_vanilla_rnnlm.sh b/egs/fisher_english/s5/local/tfrnnlm/run_vanilla_rnnlm.sh new file mode 100755 index 00000000000..27081b1b26f --- /dev/null +++ b/egs/fisher_english/s5/local/tfrnnlm/run_vanilla_rnnlm.sh @@ -0,0 +1,57 @@ +#!/bin/bash +ngram_order=4 # this option when used, the rescoring binary makes an approximation + # to merge the states of the FST generated from RNNLM. e.g. if ngram-order = 4 + # then any history that shares last 3 words would be merged into one state +stage=1 +weight=0.5 # when we do lattice-rescoring, instead of replacing the lm-weights + # in the lattice with RNNLM weights, we usually do a linear combination of + # the 2 and the $weight variable indicates the weight for the RNNLM scores + +train_text=data/train/text +nwords=9999 +opts= +dir=data/vanilla_tensorflow + +. ./utils/parse_options.sh +. ./cmd.sh +. ./path.sh + +set -e + +mkdir -p $dir + +#steps/tfrnnlm/check_tensorflow_installed.sh + +if [ $stage -le 1 ]; then + local/tfrnnlm/rnnlm_data_prep.sh --train-text $train_text --nwords $nwords $dir +fi + +mkdir -p $dir +if [ $stage -le 2 ]; then +# the following script uses TensorFlow. You could use tools/extras/install_tensorflow_py.sh to install it + $train_cmd --gpu 1 --mem 20G $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ + python steps/tfrnnlm/vanilla_rnnlm.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final ${opts} +fi + +exit 0 + +final_lm=ami_fsh.o3g.kn +LM=$final_lm.pr1-7 + +if [ $stage -le 3 ]; then + for decode_set in dev eval; do + basedir=exp/$mic/nnet3/tdnn_sp/ + decode_dir=${basedir}/decode_${decode_set} + + # Lattice rescoring + steps/lmrescore_rnnlm_lat.sh \ + --cmd "$tfrnnlm_cmd --mem 16G" \ + --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ + data/lang_$LM $dir \ + data/$mic/${decode_set}_hires ${decode_dir} \ + ${decode_dir}.vanilla.tfrnnlm.lat.${ngram_order}gram.$weight & + + done +fi + +wait diff --git a/egs/fisher_english/s5/path.sh b/egs/fisher_english/s5/path.sh index 1a6fb5f891b..84fff2ad735 100755 --- a/egs/fisher_english/s5/path.sh +++ b/egs/fisher_english/s5/path.sh @@ -2,4 +2,8 @@ export KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . 
$KALDI_ROOT/tools/config/common_path.sh +export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$KALDI_ROOT/tools/tensorflow_build/.local/lib/python2.7/site-packages +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$KALDI_ROOT/tools/tensorflow/bazel-bin/tensorflow/:/usr/local/cuda/lib64:/export/a11/hlyu/cudnn/lib64:/home/dpovey/libs/ export LC_ALL=C +. /etc/profile.d/modules.sh +module load shared cuda80/toolkit diff --git a/egs/multi_en/s5/local/make_mx6_calls.pl b/egs/multi_en/s5/local/make_mx6_calls.pl new file mode 100755 index 00000000000..ed9d6375248 --- /dev/null +++ b/egs/multi_en/s5/local/make_mx6_calls.pl @@ -0,0 +1,105 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2017 David Snyder +# Apache 2.0 +# +# Prepares the telephone portion of Mixer 6 (LDC2013S03). + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 data/\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (! -d "$db_base/mx6_speech/data/ulaw_sphere/") { + print STDERR "Directory $db_base/mx6_speech/data/ulaw_sphere/ doesn't exist\n"; + exit(1); +} + +$out_dir = "$out_dir/mx6_calls"; + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("mkdir -p $out_dir") != 0) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +%call2sph = (); +open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $$db_base/mx6_speech/docs/mx6_subjs.csv"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; +open(META, "<$db_base/mx6_speech/docs/mx6_calls.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_calls.csv"; + +if (system("find $db_base/mx6_speech/data/ulaw_sphere/ -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} +open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @toks = split("/",$sph); + $sph_id = (split("[./]",$toks[$#toks]))[0]; + $call_id = (split("_", $sph_id))[2]; + $call2sph[$call_id] = $sph; +} + +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $spk = $toks[0]; + $gender = lc $toks[1]; + if ($gender eq "f" or $gender eq "m") { + print GNDR "$spk $gender\n"; + } +} + +$num_good_files = 0; +$num_bad_files = 0; +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $call_id = $toks[0]; + ($call_date, $call_time) = split(/_/, $toks[1]); + $sid_A = $toks[4]; + $sid_B = $toks[12]; + if (-f $call2sph[$call_id]) { + $utt_A = "${sid_A}_MX6_${call_id}_A"; + $utt_B = "${sid_B}_MX6_${call_id}_B"; + print SPKR "${utt_A} $sid_A\n"; + print SPKR "${utt_B} $sid_B\n"; + print WAV "${utt_A} sph2pipe -f wav -p -c 1 $call2sph[$call_id] |\n"; + print WAV "${utt_B} sph2pipe -f wav -p -c 2 $call2sph[$call_id] |\n"; + $num_good_files++; + } else { + print STDERR "Sphere file for $call_id doesn't exist\n"; + $num_bad_files++; + } +} + +print STDERR "Processed $num_good_files utterances; $num_bad_files had missing sphere data.\n"; + +close(SPHLIST) || die; +close(SUBJECTS) || die; +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(META) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + 
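+# Finally, sort and filter the newly written files so that wav.scp, utt2spk and
+# spk2utt stay consistent, then validate the directory (the text and feature
+# checks are skipped because this directory has no transcripts or features yet).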
+system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_mx6_mic.pl b/egs/multi_en/s5/local/make_mx6_mic.pl new file mode 100755 index 00000000000..f021140f235 --- /dev/null +++ b/egs/multi_en/s5/local/make_mx6_mic.pl @@ -0,0 +1,96 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2017 David Snyder +# Apache 2.0 +# Prepares Mixer 6 (LDC2013S03) speech from a specified microphone and +# downsamples it to 8k. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 02 data/\n"; + exit(1); +} +($db_base, $ch, $out_dir) = @ARGV; + +@bad_channels = ("01", "03", "14"); +if (/$ch/i ~~ @bad_channels) { + print STDERR "Bad channel $ch\n"; + exit(1); +} + +if (! -d "$db_base/mx6_speech/data/pcm_flac/CH$ch/") { + print STDERR "Directory $db_base/mx6_speech/data/pcm_flac/CH$ch/ doesn't exist\n"; + exit(1); +} + +$out_dir = "$out_dir/mx6_mic_$ch"; +if (system("mkdir -p $out_dir")) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +if (system("mkdir -p $out_dir") != 0) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $$db_base/mx6_speech/docs/mx6_subjs.csv"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; +open(META, "<$db_base/mx6_speech/docs/mx6_ivcomponents.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_ivcomponents.csv"; + +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $spk = $toks[0]; + $gender = lc $toks[1]; + if ($gender eq "f" or $gender eq "m") { + print GNDR "$spk $gender\n"; + } +} + +$num_good_files = 0; +$num_bad_files = 0; +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $flac = "$db_base/mx6_speech/data/pcm_flac/CH$ch/$toks[0]_CH$ch.flac"; + $t1 = $toks[7]; + $t2 = $toks[8]; + @toks2 = split(/_/, $toks[0]); + if (-f $flac) { + if ($t2 - $t1 < 0.01) { # recordings with errors have 0 as the time stamps + $num_bad_files++; + next; + } + $spk = $toks2[3]; + $utt = "${spk}_MX6_$toks2[0]_$toks2[1]_$ch"; + print SPKR "${utt} $spk\n"; + print WAV "${utt} sox -t flac $flac -r 8k -t wav - trim $t1 =$t2 |\n"; + $num_good_files++; + } else { + print STDERR "File $flac doesn't exist\n"; + $num_bad_files++; + } +} + +print STDERR "Processed $num_good_files utterances; $num_bad_files had missing flac data.\n"; + +close(SUBJECTS) || die; +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(META) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_sre.pl b/egs/multi_en/s5/local/make_sre.pl new file mode 100755 index 00000000000..d6e1abf94b0 --- /dev/null +++ b/egs/multi_en/s5/local/make_sre.pl @@ -0,0 +1,75 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2015 David Snyder +# Apache 2.0. 
+# Usage: make_sre.pl + +if (@ARGV != 4) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2006S44 sre2004 sre_ref data/sre2004\n"; + exit(1); +} + +($db_base, $sre_year, $sre_ref_filename, $out_dir) = @ARGV; +%utt2sph = (); +%spk2gender = (); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find -L $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @A1 = split("/",$sph); + @A2 = split("[./]",$A1[$#A1]); + $uttId=$A2[0]; + $utt2sph{$uttId} = $sph; +} + +open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(SRE_REF, "<$sre_ref_filename") or die "Cannot open SRE reference."; +while () { + chomp; + ($speaker, $gender, $other_sre_year, $utt_id, $channel) = split(" ", $_); + $channel_num = "1"; + if ($channel eq "A") { + $channel_num = "1"; + } else { + $channel_num = "2"; + } + $channel = lc $channel; + if (($other_sre_year eq "sre20$sre_year") and (exists $utt2sph{$utt_id})) { + $full_utt_id = "$speaker-sre$sre_year-$utt_id-$channel"; + $spk2gender{"$speaker"} = $gender; + print WAV "$full_utt_id"," sph2pipe -f wav -p -c $channel_num $utt2sph{$utt_id} |\n"; + print SPKR "$full_utt_id $speaker","\n"; + } +} +foreach $speaker (keys %spk2gender) { + print GNDR "$speaker $spk2gender{$speaker}\n"; +} + +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(SRE_REF) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd2_phase1.pl b/egs/multi_en/s5/local/make_swbd2_phase1.pl new file mode 100755 index 00000000000..71b26b55de5 --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd2_phase1.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2017 David Snyder +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +%wavs = (); +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId = $t1[0]; + $wavs{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + @A1 = split("[./]",$A[0]); + $wav = $A1[0]; + if (/$wav/i ~~ @badAudio) { + # do nothing + print "Bad Audio = $wav"; + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wavs{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wavs{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd2_phase2.pl b/egs/multi_en/s5/local/make_swbd2_phase2.pl new file mode 100755 index 00000000000..337ab9d9708 --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd2_phase2.pl @@ -0,0 +1,107 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC99S79 data/swbd2_phase2_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/DISC1/doc/callstat.tbl") || die "Could not open $db_base/DISC1/doc/callstat.tbl"; +open(CI, "<$db_base/DISC1/doc/callinfo.tbl") || die "Could not open $db_base/DISC1/doc/callinfo.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId=$t1[0]; + $wav{$uttId} = $sph; +} + +while () { + $line = $_ ; + $ci = ; + $ci = ; + @ci = split(",",$ci); + $wav = $ci[0]; + @A = split(",", $line); + if (/$wav/i ~~ @badAudio) { + # do nothing + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[4]; + $gender2 = $A[5]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wav{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wav{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd2_phase3.pl b/egs/multi_en/s5/local/make_swbd2_phase3.pl new file mode 100755 index 00000000000..f27853415a0 --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd2_phase3.pl @@ -0,0 +1,102 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC2002S06 data/swbd2_phase3_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/DISC1/docs/callstat.tbl") || die "Could not open $db_base/DISC1/docs/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId=$t1[0]; + $wav{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + $wav = "sw_" . $A[0] ; + if (/$wav/i ~~ @badAudio) { + # do nothing + } else { + $spkr1= "sw_" . $A[3]; + $spkr2= "sw_" . $A[4]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wav{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wav{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd_cellular1.pl b/egs/multi_en/s5/local/make_swbd_cellular1.pl new file mode 100755 index 00000000000..e30c710e6fa --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd_cellular1.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC2001S13 data/swbd_cellular1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/swb_callstats.tbl") || die "Could not open $db_base/doc/swb_callstats.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("40019", "45024", "40022"); + +while () { + $line = $_ ; + @A = split(",", $line); + if (/$A[0]/i ~~ @badAudio) { + # do nothing + } else { + $wav = "sw_" . $A[0]; + $spkr1= "sw_" . $A[1]; + $spkr2= "sw_" . $A[2]; + $gender1 = $A[3]; + $gender2 = $A[4]; + if ($A[3] eq "M") { + $gender1 = "m"; + } elsif ($A[3] eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($A[4] eq "M") { + $gender2 = "m"; + } elsif ($A[4] eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$db_base/$wav.sph") { + $uttId = $spkr1 . "-swbdc_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/$wav.sph |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "-swbdc_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/$wav.sph |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $db_base/$wav.sph\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd_cellular2.pl b/egs/multi_en/s5/local/make_swbd_cellular2.pl new file mode 100755 index 00000000000..4de954c194c --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd_cellular2.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2004S07 data/swbd_cellular2_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/docs/swb_callstats.tbl") || die "Could not open $db_base/docs/swb_callstats.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio=("45024", "40022"); + +while () { + $line = $_ ; + @A = split(",", $line); + if (/$A[0]/i ~~ @badAudio) { + # do nothing + } else { + $wav = "sw_" . $A[0]; + $spkr1= "sw_" . $A[1]; + $spkr2= "sw_" . 
$A[2]; + $gender1 = $A[3]; + $gender2 = $A[4]; + if ($A[3] eq "M") { + $gender1 = "m"; + } elsif ($A[3] eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($A[4] eq "M") { + $gender2 = "m"; + } elsif ($A[4] eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$db_base/data/$wav.sph") { + $uttId = $spkr1 . "-swbdc_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/data/$wav.sph |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "-swbdc_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/data/$wav.sph |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $db_base/data/$wav.sph\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_a.sh b/egs/multi_en/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_a.sh new file mode 100755 index 00000000000..947465dd9a1 --- /dev/null +++ b/egs/multi_en/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_a.sh @@ -0,0 +1,473 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=400 +test_nj=50 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +multi=multi_a +chain_affix= +tdnn_affix=_semisup_5b + +# Data directories +supervised_data=data/multi_a/tri5a +unsupervised_data=data/train_mixer6_1a_seg + +# Input seed system +sup_gmm=tri5a +sup_chain_dir=exp/multi_a/chain/tdnn_5b_sp +sup_lat_dir=exp/multi_a/tri5a_lats_nodup_sp +sup_tree_dir=exp/multi_a/chain/tri5a_tree +sup_ivector_dir=exp/multi_a/nnet3/ivectors_multi_a/tri5a_sp +sup_ivector_root_dir=exp/multi_a/nnet3 + +train_new_ivector=false +nnet3_affix= # affix for nnet3 -- relates to i-vector used + # Applicable if training a new i-vector extractor + +# Unsupervised options +unsup_decode_opts="--frames-per-chunk 160 --extra-left-context 0 --extra-right-context 0" +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +supervision_weights=1.0,1.0 +lm_weights=3,1 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1536 +hidden_dim_l=1792 +bottleneck_dim=320 + +apply_deriv_weights=true +use_smart_splitting=true + +# 
training options +num_epochs=2 +initial_effective_lrate=0.0005 +final_effective_lrate=0.00005 +max_param_change=2.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +decode_iter= +decode_dir_affix= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +exp_root=exp/$multi + +RANDOM=0 + +#if ! cuda-compiled; then +# cat </dev/null || true +utils/fix_data_dir.sh ${unsupervised_data}_sp_hires || exit 1 + +unsupervised_set=$(basename $unsupervised_data) +if [ $stage -le 2 ]; then + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + ${unsupervised_data}_sp_hires ${unsupervised_data}_sp_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + ${unsupervised_data}_sp_max2_hires $sup_ivector_root_dir/extractor \ + $sup_ivector_root_dir/ivectors_${unsupervised_set}_sp_hires || exit 1 +fi + +split_nj=400 +if [ $stage -le 5 ]; then + echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $sup_chain_dir" + steps/nnet3/decode_semisup.sh --num-threads 4 --sub-split $nj --nj $split_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \ + --online-ivector-dir $sup_ivector_root_dir/ivectors_${unsupervised_set}_sp_hires \ + $unsup_decode_opts --keep-subsplit true \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --word-determinize false \ + $graphdir ${unsupervised_data}_sp_hires $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp || exit 1 +fi + +if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --skip-scoring true \ + --write-compact true --acwt 0.1 --beam 8.0 --keep-subsplit true \ + $unsup_decode_lang $unsup_rescore_lang \ + ${unsupervised_data}_sp_hires \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} || exit 1 +fi + +ln -sf ../final.mdl $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix}/ || true + +frame_subsampling_factor=1 +if [ -f $sup_chain_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $sup_chain_dir/frame_subsampling_factor` +fi + +if [ $stage -le 7 ]; then + steps/nnet3/merge_subsplit_lattices.sh \ + --cmd "${train_cmd}" --skip-scoring true --skip-diagnostics true \ + $unsup_decode_lang \ + ${unsupervised_data}_sp_hires \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} || exit 1 +fi + +unsup_lat_dir=${sup_chain_dir}/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} + + +if [ $stage -le 8 ]; then + steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ + ${unsupervised_data}_sp_hires \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} \ + $sup_chain_dir/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} || exit 1 +fi +echo $frame_subsampling_factor > $sup_chain_dir/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix}/frame_subsampling_factor + +cmvn_opts=`cat $sup_chain_dir/cmvn_opts` || exit 1 + +if [ ! -f $sup_tree_dir/final.mdl ]; then + echo "$0: $sup_tree_dir/final.mdl does not exist." 
+ exit 1 +fi + +diff $sup_tree_dir/tree $sup_chain_dir/tree || { echo "$0: $sup_tree_dir/tree and $sup_chain_dir/tree differ"; exit 1; } + +dir=$exp_root/chain${chain_affix}/tdnn${tdnn_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $sup_chain_dir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $sup_chain_dir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $sup_chain_dir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.0015 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="l2-regularize=0.0015 orthonormal-constraint=-1.0" + output_opts="l2-regularize=0.001" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=$hidden_dim + linear-component name=tdnn2l0 dim=$bottleneck_dim $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=$bottleneck_dim $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=$hidden_dim + linear-component name=tdnn3l dim=$bottleneck_dim $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=$hidden_dim input=Append(0,1) + linear-component name=tdnn4l0 dim=$bottleneck_dim $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=$bottleneck_dim $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=$hidden_dim + linear-component name=tdnn5l dim=$bottleneck_dim $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=$hidden_dim input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=$bottleneck_dim $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=$hidden_dim_l + linear-component name=tdnn7l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=$bottleneck_dim $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=$hidden_dim + linear-component name=tdnn8l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=$bottleneck_dim $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=$hidden_dim_l + linear-component name=tdnn9l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + 
linear-component name=tdnn9l dim=$bottleneck_dim $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=$hidden_dim + linear-component name=tdnn10l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=$bottleneck_dim $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=$hidden_dim_l + linear-component name=tdnn11l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=$bottleneck_dim $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=$hidden_dim + linear-component name=prefinal-l dim=$bottleneck_dim $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=$hidden_dim_l + linear-component name=prefinal-chain-l dim=$bottleneck_dim $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=$hidden_dim_l + linear-component name=prefinal-xent-l dim=$bottleneck_dim $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +egs_left_context=`perl -e "print int($model_left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($model_right_context + $frame_subsampling_factor / 2)"` + +supervised_set=$(basename $supervised_data) +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set}_sp + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true --constrained false \ + ${supervised_data}_sp_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}_sp + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $sup_chain_dir/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix}/weights.scp \ + --online-ivector-dir $sup_ivector_root_dir/ivectors_${unsupervised_set}_sp_hires \ + --generate-egs-scp true --constrained false $unsup_egs_opts \ + ${unsupervised_data}_sp_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $sup_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$comb_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs false \ + --feat-dir ${supervised_data}_sp_hires \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $sup_ivector_root_dir/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if [ ! -z "$test_rescore_lang" ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + $test_lang $test_rescore_lang data/${decode_set}_hires \ + $dir/decode_${decode_set}${test_graph_affix} \ + $dir/decode_${decode_set}${test_rescore_graph_affix} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_${multi}_${gmm}_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang $sup_ivector_root_dir/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${test_graph_affix} || exit 1; + if $rescore; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + $test_lang $test_rescore_lang data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${test_graph_affix} \ + ${dir}_online/decode_${decode_set}${test_rescore_graph_affix} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/multi_en/s5/local/semisup/run_mixer6.sh b/egs/multi_en/s5/local/semisup/run_mixer6.sh new file mode 100755 index 00000000000..6bdb74eb9bb --- /dev/null +++ b/egs/multi_en/s5/local/semisup/run_mixer6.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script demonstrates semi-supervised training using 50 hours of +# supervised data and 250 hours of unsupervised data. +# We assume the supervised data is in data/train_sup and unsupervised data +# is in data/train_unsup100k_250k. +# For LM training, we assume there is data/train/text, from which +# we will exclude the utterances contained in the unsupervised set. +# We use all 300 hours of semi-supervised data for i-vector extractor training. + +# This differs from run_100k.sh, which uses only 100 hours supervised data for +# both i-vector extractor training and LM training. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/multi_a +supervised_data=data/multi_a/tri5b +stage=0 + +. 
utils/parse_options.sh + +if [ $stage -le 0 ]; then + local/mixer6_calls_prepare_data.py /export/LDC/LDC2013S03/mx6_speech data/local/mixer6 + local/mixer6_format_data.sh data/local/mixer6 data/train_mixer6 +fi + +mkdir -p sad_model + +if [ $stage -le 1 ]; then + ( + cd sad_model + wget http://kaldi-asr.org/models/0004_tdnn_stats_asr_sad_1a.tar.gz + tar -xzvf 0004_tdnn_stats_asr_sad_1a.tar.gz + ) +fi + +if [ $stage -le 2 ]; then + steps/segmentation/detect_speech_activity.sh --stage $sad_stage \ + --cmd "$train_cmd" --nj 400 --convert-data-dir-to-whole false \ + --extra-left-context 79 --extra-right-context 21 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 150 --mfcc-config sad_model/conf/mfcc_hires.conf \ + data/train_mixer6 sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + sad_model/mfcc_hires sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + data/train_mixer6_1a +fi + +for f in data/train_mixer6_1a_seg/utt2spk \ + data/train_mixer6_1a_seg/feats.scp; do + if [ ! -f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +############################################################################### +# Prepare the 50 hours supervised set and subsets for initial GMM training +############################################################################### + +if [ $stage -le 0 ]; then + utils/subset_data_dir.sh --speakers data/train_sup 50000 data/train_sup50k || exit 1 + utils/subset_data_dir.sh --shortest data/train_sup50k 25000 data/train_sup50k_short || exit 1 + utils/subset_data_dir.sh --speakers data/train_sup50k 30000 data/train_sup50k_30k || exit 1; +fi + +############################################################################### +# GMM system training using 50 hours supervised data +############################################################################### + +if [ $stage -le 1 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup50k_short data/lang $exp_root/mono0a || exit 1 +fi + +if [ $stage -le 2 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/mono0a $exp_root/mono0a_ali || exit 1 + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/mono0a_ali $exp_root/tri1 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri1 $exp_root/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri1/graph data/dev $exp_root/tri1/decode_dev)& +fi + +if [ $stage -le 3 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/tri1 $exp_root/tri1_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/tri1_ali $exp_root/tri2 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri2 $exp_root/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri2/graph data/dev $exp_root/tri2/decode_dev)& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri2 $exp_root/tri2_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 30000 data/train_sup50k data/lang $exp_root/tri2_ali $exp_root/tri3a || exit 1; + + (utils/mkgraph.sh data/lang_test $exp_root/tri3a $exp_root/tri3a/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri3a/graph data/dev $exp_root/tri3a/decode_dev)& +fi + +if [ $stage -le 
5 ]; then + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri3a $exp_root/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 50000 data/train_sup50k data/lang $exp_root/tri3a_ali $exp_root/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri4a $exp_root/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri4a/graph data/dev $exp_root/tri4a/decode_dev + )& +fi + +############################################################################### +# Prepare semi-supervised train set +############################################################################### + +if [ $stage -le 6 ]; then + utils/combine_data.sh data/semisup50k_100k_250k \ + data/train_sup50k data/train_unsup100k_250k || exit 1 +fi + +############################################################################### +# Train LM on all the text in data/train/text, but excluding the +# utterances in the unsupervised set +############################################################################### + +if [ $stage -le 7 ]; then + mkdir -p data/local/pocolm_ex250k + + utils/filter_scp.pl --exclude data/train_unsup100k_250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + + if [ ! -f data/lang_test_poco_ex250k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big + fi +fi + +############################################################################### +# Prepare lang directories with UNK modeled using phone LM +############################################################################### + +if [ $stage -le 8 ]; then + local/run_unk_model.sh || exit 1 + + for lang_dir in data/lang_test_poco_ex250k; do + rm -r ${lang_dir}_unk ${lang_dir}_unk_big 2>/dev/null || true + cp -rT data/lang_unk ${lang_dir}_unk + cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst + cp -rT data/lang_unk ${lang_dir}_unk_big + cp ${lang_dir}_big/G.carpa ${lang_dir}_unk_big/G.carpa; + done +fi + +############################################################################### +# Train seed chain system using 50 hours supervised data. +# Here we train i-vector extractor on combined supervised and unsupervised data +############################################################################### + +if [ $stage -le 9 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train_sup50k \ + --ivector-train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _1a --tree-affix bi_a \ + --gmm tri4a --exp-root $exp_root || exit 1 + + # WER on dev 21.41 + # WER on test 21.03 + # Final train prob -0.1035 + # Final valid prob -0.1667 + # Final train prob (xent) -1.5926 + # Final valid prob (xent) -1.7990 +fi + +############################################################################### +# Semi-supervised training using 50 hours supervised data and +# 250 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
+############################################################################### + +if [ $stage -le 10 ]; then + local/semisup/chain/run_tdnn_50k_semisupervised.sh \ + --supervised-set train_sup50k \ + --unsupervised-set train_unsup100k_250k \ + --sup-chain-dir $exp_root/chain_semi50k_100k_250k/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain_semi50k_100k_250k/tri4a_train_sup50k_sp_unk_lats \ + --sup-tree-dir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --ivector-root-dir $exp_root/nnet3_semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _semisup_1a \ + --exp-root $exp_root || exit 1 + + # WER on dev 18.98 + # WER on test 18.85 + # Final output-0 train prob -0.1381 + # Final output-0 valid prob -0.1723 + # Final output-0 train prob (xent) -1.3676 + # Final output-0 valid prob (xent) -1.4589 + # Final output-1 train prob -0.7671 + # Final output-1 valid prob -0.7714 + # Final output-1 train prob (xent) -1.1480 + # Final output-1 valid prob (xent) -1.2382 +fi + +############################################################################### +# Oracle system trained on combined 300 hours including both supervised and +# unsupervised sets. We use i-vector extractor, tree, and GMM trained +# on only the supervised for fair comparison to semi-supervised experiments. +############################################################################### + +if [ $stage -le 11 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --common-treedir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --tdnn-affix 1a_oracle --nj 100 \ + --gmm tri4a --exp-root $exp_root \ + --stage 9 || exit 1 + + # WER on dev 17.55 + # WER on test 17.72 + # Final output train prob -0.1155 + # Final output valid prob -0.1510 + # Final output train prob (xent) -1.7458 + # Final output valid prob (xent) -1.9045 +fi diff --git a/egs/multi_en/s5/local/semisup/run_semisup.sh b/egs/multi_en/s5/local/semisup/run_semisup.sh new file mode 100644 index 00000000000..1b1ac29da62 --- /dev/null +++ b/egs/multi_en/s5/local/semisup/run_semisup.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +stage=0 +nj=800 +sad_stage=5 + +. utils/parse_options.sh + +. ./path.sh +. ./cmd.sh + +if [ $stage -le 0 ]; then + # Prepare SWBD corpora. 
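+  # (Illustrative note, not part of the original comments.) Each data-prep script
+  # below takes an LDC corpus directory and an output data directory, and writes
+  # wav.scp (sph2pipe pipes), utt2spk and spk2gender there, in the same way as
+  # the make_swbd_cellular*.pl scripts added earlier in this patch.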
+ local/make_swbd2_phase1.pl /export/LDC/LDC98S75 \ + data/swbd2_phase1_train + local/make_swbd2_phase2.pl /export/LDC/LDC99S79 \ + data/swbd2_phase2_train + local/make_swbd2_phase3.pl /export/LDC/LDC2002S06 \ + data/swbd2_phase3_train + local/make_swbd_cellular1.pl /export/LDC/LDC2001S13 \ + data/swbd_cellular1_train + local/make_swbd_cellular2.pl /export/LDC/LDC2004S07 \ + data/swbd_cellular2_train +fi + +if [ $stage -le 1 ]; then + mkdir -p data/local/sre + wget -P data/local/sre http://www.openslr.org/resources/15/speaker_list.tgz + tar -C data/local/sre -xvf data/local/sre/speaker_list.tgz + sre_ref=data/local/sre/speaker_list + + local/make_sre.pl /export/LDC/LDC2006S44 04 \ + data/local/sre/speaker_list data/sre2004 +fi + +if [ $stage -le 2 ]; then + local/make_mx6_calls.pl /export/LDC/LDC2013S03 data/local/mx6 + + for mic in 02 04 05 06 07 08 09 10 11 12 13; do + local/make_mx6_mic.pl /export/LDC/LDC2013S03 $mic data/local/mx6 + done + + utils/combine_data.sh data/local/mx6/mx6_mic_04_to_13 \ + data/local/mx6/mx6_mic_{04,05,06,07,08,09,10,11,12,13} +fi + +if [ $stage -le 3 ]; then + utils/data/get_reco2dur.sh \ + --read-entire-file true --cmd "$train_cmd" --nj 32 --permissive true \ + data/local/mx6/mx6_mic_04_to_13 + + utils/copy_data_dir.sh data/local/mx6/mx6_mic_04_to_13 \ + data/local/mx6/mx6_mic_04_to_13_filtered + + utils/filter_scp.pl data/local/mx6/mx6_mic_04_to_13/reco2dur \ + data/local/mx6/mx6_mic_04_to_13/wav.scp > data/local/mx6_mic_04_to_13_filtered/wav.scp + + utils/fix_data_dir.sh \ + data/local/mx6/mx6_mic_04_to_13_filtered + + utils/subset_data_dir.sh \ + data/local/mx6/mx6_mic_04_to_13_filtered 2000 \ + data/local/mx6/mx6_mic_04_to_13_2k +fi + +if [ $stage -le 4 ]; then + utils/combine_data.sh data/mx6_mic \ + data/local/mx6/mx6_mic_02 data/local/mx6/mx6_mic_04_to_13_2k + + utils/copy_data_dir.sh data/local/mx6/mx6_calls data/mx6_calls +fi + +if [ $stage -le 5 ]; then + utils/combine_data.sh data/train_semisup \ + data/swbd2_phase1_train \ + data/swbd2_phase2_train \ + data/swbd2_phase3_train \ + data/swbd_cellular1_train \ + data/swbd_cellular2_train \ + data/sre2004 data/mx6_calls data/mx6_mic +fi + +mkdir -p sad_model +if [ $stage -le 6 ]; then + ( + cd sad_model + wget http://kaldi-asr.org/models/0004_tdnn_stats_asr_sad_1a.tar.gz + tar -xzvf 0004_tdnn_stats_asr_sad_1a.tar.gz + ) +fi + +if [ $stage -le 7 ]; then + steps/segmentation/detect_speech_activity.sh --stage $sad_stage \ + --cmd "$train_cmd" --nj $nj --convert-data-dir-to-whole true \ + --extra-left-context 79 --extra-right-context 21 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 150 --mfcc-config sad_model/conf/mfcc_hires.conf \ + data/train_semisup sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + sad_model/mfcc_hires sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + data/train_semisup_1a +fi + +for f in data/train_semisup_1a_seg/{utt2spk,feats.scp}; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 6b6c08e779a..d6771bf971d 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -167,7 +167,6 @@ if [ $stage -le 8 ]; then --feat.online-ivector-dir "$ivector_dir" \ --chain.xent-regularize $xent_regularize \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index e0034ddd7d2..eb9b0320333 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -1,8 +1,6 @@ #!/bin/bash # This script extracts mfcc features using mfcc_config and trains ubm model and # ivector extractor and extracts ivector for train and test. -. ./cmd.sh - stage=1 nnet_affix=_online @@ -11,6 +9,7 @@ ivector_dim=50 mfcc_config=conf/mfcc_hires.conf use_ivector=true # If false, it skips training ivector extractor and # ivector extraction stages. + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh diff --git a/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh new file mode 100755 index 00000000000..674b8745c42 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +set -e -o pipefail + +# e.g. try lm-scale: +# local/chain/run_semisupervised.sh --stage 1 --tdnn-affix _sup1a --egs-affix _lmwt1.0 --lattice-lm-scale 1.0 + + +# frames_per_eg 300 +# local/chain/run_semisupervised.sh --stage 1 --tdnn-affix _sup1d --unsup-frames-per-eg 300 --egs-affix _fpe300 + +stage=0 +nj=30 +decode_nj=30 +base_train_set=train_cleaned # the starting point train-set +base_gmm=tri3_cleaned # the starting point of training on the supervised data (no flat start for now) +semi_affix= # affix relating train-set splitting proportion + # (currently supervised 25%) and the base train set (currently _cleaned), etc. +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config +unsup_egs_weight=1.0 +lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam= # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 5" + +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +supervised_set=${base_train_set}_sup${semi_affix} +unsupervised_set=${base_train_set}_unsup${semi_affix} +gmm=${base_gmm}_semi${semi_affix} # the gmm to be supplied to chain/run_tdnn.sh +nnet3_affix=_cleaned_semi${semi_affix} # affix for nnet3 and chain dirs + +if ! 
cuda-compiled; then + cat < data/$supervised_set/supervised_uttlist || true + utils/shuffle_list.pl data/$base_train_set/feats.scp | cut -d' ' -f1 | \ + tail -$num_unsupervised_utts > data/$supervised_set/unsupervised_uttlist || true + utils/subset_data_dir.sh --utt-list data/$supervised_set/supervised_uttlist \ + data/$base_train_set data/$supervised_set || exit 1 + utils/subset_data_dir.sh --utt-list data/$supervised_set/unsupervised_uttlist \ + data/$base_train_set data/$unsupervised_set || exit 1 + utils/data/subset_data_dir.sh --utt-list data/$unsupervised_set/feats.scp \ + data/${base_train_set}_sp_hires data/${unsupervised_set}_hires +fi + +if [ $stage -le -3 ]; then + # align the supervised subset with the current cleaned gmm + if [ -f $gmm/ali.1.gz ]; then + echo "$0: alignments in $gmm appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning the supervised data data/${supervised_set}" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${supervised_set} data/lang exp/$base_gmm exp/$gmm +fi + +if [ $stage -le -2 ]; then + echo "$0: chain training on the supervised subset data/${supervised_set}" + local/chain/run_tdnn.sh $train_supervised_opts --remove-egs false \ + --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix +fi + +if [ $stage -le -1 ]; then + echo "$0: getting ivectors for the hires unsupervised data data/${unsupervised_set}_hires" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \ + data/${unsupervised_set}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires +fi + +chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +if [ $stage -le 0 ]; then + echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set}${decode_affix} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${unsupervised_set}_hires \ + ${chaindir}/decode_${unsupervised_set}${decode_affix} ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore + ln -s ../final.mdl $chaindir/decode_${unsupervised_set}${decode_affix}_rescore/final.mdl || true +fi + +if [ $stage -le 1 ]; then + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $left_tolerance --right-tolerance $right_tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + 
--cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --egs-weight $unsup_egs_weight \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir \ + ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore $chaindir/unsup_egs${decode_affix}${egs_affix} +fi + +sup_egs_dir=$chaindir/egs +unsup_egs_dir=$chaindir/unsup_egs${decode_affix}${egs_affix} +comb_egs_dir=$chaindir/comb_egs${decode_affix}${egs_affix} +if [ $stage -le 2 ]; then + echo "$0: combining supervised/unsupervised egs" + num_archives=`cat $chaindir/egs/info/num_archives` + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 17 --remove-egs false --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi + diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh index 4f86691b752..109168911d9 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -52,6 +52,7 @@ train_set=train_cleaned gmm=tri3_cleaned # the gmm for the target data num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. @@ -59,6 +60,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true # End configuration section. 
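# (Hypothetical usage example, not part of this patch.) The new num_epochs and
# remove_egs variables above become command-line options through
# utils/parse_options.sh, e.g.:
#   local/chain/tuning/run_tdnn_1d.sh --num-epochs 2 --remove-egs false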
echo "$0 $@" # Print the command line for logging @@ -212,13 +214,13 @@ if [ $stage -le 18 ]; then --egs.chunk-width 150 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir $train_data_dir \ --tree-dir $tree_dir \ --lat-dir $lat_dir \ diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh index c1a22e274b8..48f9e17d30b 100755 --- a/egs/wsj/s5/steps/conf/apply_calibration.sh +++ b/egs/wsj/s5/steps/conf/apply_calibration.sh @@ -28,6 +28,7 @@ caldir=$4 dir=$5 model=$latdir/../final.mdl # assume model one level up from decoding dir. +model_dir=$latdir/.. calibration=$caldir/calibration.mdl word_feats=$caldir/word_feats word_categories=$caldir/word_categories @@ -49,6 +50,12 @@ cp $calibration $dir/calibration.mdl cp $word_feats $dir/word_feats cp $word_categories $dir/word_categories +frame_shift_opt= +if [ -f $model_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor) + frame_shift_opt="--frame-shift=0.0$frame_subsampling_factor" +fi + # Create the ctm with raw confidences, # - we keep the timing relative to the utterance, if [ $stage -le 0 ]; then @@ -58,7 +65,7 @@ if [ $stage -le 0 ]; then lattice-push --push-strings=false ark:- ark:- \| \ lattice-align-words-lexicon --max-expand=10.0 \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \ '>' $dir/JOB.ctm # Merge and clean, @@ -76,7 +83,7 @@ fi # Create the forwarding data for logistic regression, if [ $stage -le 2 ]; then steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ - --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories + --lattice-depth $latdepth $frame_shift_opt $dir/ctm_int $word_feats $word_categories fi # Apply calibration model to dev, diff --git a/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py b/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py new file mode 100755 index 00000000000..02a616b2c03 --- /dev/null +++ b/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py @@ -0,0 +1,101 @@ +#! 
/usr/bin/env python + +import argparse +import logging +import sys + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts CTM to per-frame weights by the word + posteriors in the CTM as the weights.""") + + parser.add_argument("--frame-shift", type=float, default=0.01, + help="Frame shift value in seconds") + parser.add_argument("--default-weight", type=float, default=1.0, + help="Default weight on silence frames") + parser.add_argument("segments_in", type=str, help="Input segments file") + parser.add_argument("ctm_in", type=str, help="Input utterance-level CTM " + "file i.e. the first column has utterance-ids") + parser.add_argument("weights_out", type=str, help="Output per-frame " + "weights vector written in Kaldi text archive format") + + args = parser.parse_args() + + return args + + +def run(args): + utt2num_frames = {} + with common_lib.smart_open(args.segments_in) as segments_reader: + for line in segments_reader.readlines(): + parts = line.strip().split() + if len(parts) not in [4, 5]: + raise RuntimeError("Invalid line {0} in segments file {1}" + "".format(line.strip(), args.segments_in)) + utt2num_frames[parts[0]] = int((float(parts[3]) - float(parts[2])) + / args.frame_shift + 0.5) + + num_utt = 0 + with common_lib.smart_open(args.ctm_in) as ctm_reader, \ + common_lib.smart_open(args.weights_out, 'w') as weights_writer: + prev_utt = None + weights = [] + for line in ctm_reader.readlines(): + parts = line.strip().split() + if len(parts) not in [5, 6]: + raise RuntimeError("Invalid line {0} in CTM file {1}" + "".format(line.strip(), args.ctm_in)) + + utt = parts[0] + if utt != prev_utt: + if prev_utt is not None: + assert len(weights) >= utt2num_frames[prev_utt] + common_lib.write_vector_ascii(weights_writer, weights, + key=prev_utt) + weights = [args.default_weight for x in + range(utt2num_frames[utt])] + + start_time = float(parts[2]) + dur = float(parts[3]) + prob = 1.0 if len(parts) == 5 else float(parts[5]) + + start_frame = int(start_time / args.frame_shift + 0.5) + length = int(dur / args.frame_shift) + + if len(weights) < start_frame + length: + weights.extend([args.default_weight for x in + range(len(weights), start_frame + length)]) + for x in range(start_frame, start_frame + length): + weights[x] = prob + + assert len(weights) >= start_frame + length + prev_utt = utt + num_utt += 1 + assert len(weights) >= utt2num_frames[prev_utt] + common_lib.write_vector_ascii(weights_writer, weights, + key=prev_utt) + + if num_utt == 0: + raise RuntimeError("Failed to process any utterances") + + +def main(): + args = get_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py index bc8f92a2f7f..753771b1d89 100755 --- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -10,7 +10,7 @@ Prepare input features and training targets for logistic regression, which calibrates the Minimum Bayes Risk posterior confidences. 
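(Illustrative note, not part of the original docstring: the --frame-shift option
added below exists so that the lattice-depth slicing also works for chain models;
the calling shell scripts pass --frame-shift=0.0$frame_subsampling_factor, e.g.
0.03 s for a frame-subsampling factor of 3, instead of the default 0.01 s.)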
-The logisitc-regression input features are: +The logisitc-regression input features are: - posteriors from 'ctm' transformed by logit, - logarithm of word-length in letters, - 10base logarithm of unigram probability of a word from language model, @@ -34,6 +34,8 @@ parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='') +parser.add_option("--frame-shift", type=float, default=0.01, + help="Frame shift value in seconds [default %default]") (o, args) = parser.parse_args() if len(args) != 3: @@ -63,11 +65,11 @@ if o.conf_targets != '': with open(o.conf_targets,'w') as f: for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm: - # Skip the words we don't know if being correct, - if score_tag == 'U': continue + # Skip the words we don't know if being correct, + if score_tag == 'U': continue # Some words are excluded from training (partial words, hesitations, etc.), # (Value: 1 == keep word, 0 == exclude word from the targets), - if not word_filter[wrd_id]: continue + if not word_filter[wrd_id]: continue # Build the key, key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag) # Build the target, @@ -102,7 +104,7 @@ # - log of word-length, log_word_length = math.log(word_length[wrd_id]) # i.e. number of phones in a word, # - categorical distribution of words (with frequency higher than min-count), - wrd_1_of_k = [0]*wrd_cat_num; + wrd_1_of_k = [0]*wrd_cat_num; wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; # Compose the input feature vector, @@ -110,10 +112,10 @@ # Optionally add average-depth of lattice at the word position, if o.lattice_depth != '': - depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + depth_slice = depths[utt][int(float(beg) / o.frame_shift + 0.5):int((float(beg) + max(o.frame_shift, float(dur))) / o.frame_shift + 0.5)] log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) feats += [ log_avg_depth ] - # Store the input features, + # Store the input features, f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh index c2aca05056e..9a8451c9f85 100755 --- a/egs/wsj/s5/steps/conf/train_calibration.sh +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -12,7 +12,7 @@ # (- categorical distribution of 'lang/words.txt', DISABLED) # begin configuration section. -cmd= +cmd=run.pl lmwt=12 decode_mbr=true word_min_count=10 # Minimum word-count for single-word category, @@ -43,6 +43,7 @@ latdir=$4 dir=$5 model=$latdir/../final.mdl # assume model one level up from decoding dir. +model_dir=$latdir/.. for f in $data/text $lang/words.txt $word_feats $latdir/lat.1.gz; do [ ! 
-f $f ] && echo "$0: Missing file $f" && exit 1 @@ -57,6 +58,12 @@ echo $lmwt >$dir/lmwt echo $decode_mbr >$dir/decode_mbr cp $word_feats $dir/word_feats +frame_shift_opt= +if [ -f $model_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor) + frame_shift_opt="--frame-shift=0.0$frame_subsampling_factor" +fi + # Create the ctm with raw confidences, # - we keep the timing relative to the utterance, if [ $stage -le 0 ]; then @@ -66,7 +73,7 @@ if [ $stage -le 0 ]; then lattice-push --push-strings=false ark:- ark:- \| \ lattice-align-words-lexicon --max-expand=10.0 \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \ '>' $dir/JOB.ctm # Merge and clean, @@ -104,7 +111,7 @@ fi if [ $stage -le 3 ]; then steps/conf/prepare_calibration_data.py \ --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ - --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories + --lattice-depth $latdepth $frame_shift_opt $dir/ctm_aligned_int $word_feats $dir/word_categories fi # Train the logistic regression, diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 71e64d9e680..f6be7a286ec 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -413,13 +413,7 @@ def CreateReverberatedCopy(input_dir, wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); - read_entire_file="false" - for value in wav_scp.values(): - # we will add more checks for sox commands which modify the header as we come across these cases in our data - if "sox" in value and "speed" in value: - read_entire_file="true" - break - data_lib.RunKaldiCommand("wav-to-duration --read-entire-file={1} scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir, read_entire_file)) + data_lib.RunKaldiCommand("utils/data/get_reco2dur.sh {}".format(input_dir)) durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 1e8e2ced6ce..29143c275cb 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -386,6 +386,33 @@ def write_matrix_ascii(file_or_fd, mat, key=None): if fd is not file_or_fd : fd.close() +def write_vector_ascii(file_or_fd, vec, key=None): + """This function writes the vector 'vec' stored as a list + in kaldi vector text format. + The destination can be a file or an opened file descriptor. + If key is provided, then vector is written to an archive with the 'key' + as the index field. 
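+    Example (illustrative, not part of the original patch):
+        write_vector_ascii(fd, [1.0, 0.5], key="utt1")
+        # writes one archive line: utt1 [ 1.000000 0.500000 ]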
+ """ + try: + fd = open(file_or_fd, 'w') + except TypeError: + # 'file_or_fd' is opened file descriptor, + fd = file_or_fd + + try: + if key is not None: + print ("{0} [".format(key), + file=fd, end=' ') # ark-files have keys (utterance-id) + else: + print (" [", file=fd, end=' ') + + line = ' '.join(["{0:f}".format(x) for x in vec]) + line += " ]" + print (line, file=fd) + finally: + if fd is not file_or_fd : fd.close() + + def read_matrix_ascii(file_or_fd): """This function reads a matrix in kaldi matrix text format and stores it as a list of lists. diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 63b1c12c759..56960f02a1c 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -387,7 +387,9 @@ def get_train_times(exp_dir): train_times[iter] = max(values) return train_times -def parse_prob_logs(exp_dir, key='accuracy', output="output"): + +def parse_prob_logs(exp_dir, key='accuracy', output="output", + get_smbr_objf=False): train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) train_prob_strings = common_lib.get_command_stdout( @@ -411,28 +413,53 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output)) + other_objfs_str = "" + for i in range(2): + other_objfs_str += "[0-9.\-e]+ [+] "; + smbr_parse_regex = re.compile( + ".*compute_prob_.*\.([0-9]+).log:LOG " + ".nnet3.*compute-prob.*:PrintTotalStats..:" + "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " + "'{output}'.*is {other_objfs}([0-9.\-e]+) .*per frame".format( + output=output, other_objfs=other_objfs_str)) + train_objf = {} valid_objf = {} for line in train_prob_strings.split('\n'): mat_obj = parse_regex.search(line) - if mat_obj is not None: + mmi_mat_obj = smbr_parse_regex.search(line) + + if mmi_mat_obj is not None: # This is SMBR training + groups = (mat_obj.groups() if get_smbr_objf + else mmi_mat_obj.groups()) + elif mat_obj is not None and not get_smbr_objf: # This is normal chain training groups = mat_obj.groups() - if groups[1] == key: - train_objf[int(groups[0])] = groups[2] + else: + continue + + if groups[1] == key: + train_objf[int(groups[0])] = groups[2] if not train_objf: - raise KaldiLogParseException("Could not find any lines with {k} in " + raise KaldiLogParseException("Could not find any values with {k} in " " {l}".format(k=key, l=train_prob_files)) for line in valid_prob_strings.split('\n'): mat_obj = parse_regex.search(line) - if mat_obj is not None: + mmi_mat_obj = smbr_parse_regex.search(line) + + if mmi_mat_obj is not None: # This is SMBR training + groups = (mat_obj.groups() if get_smbr_objf + else mmi_mat_obj.groups()) + elif mat_obj is not None: # This is normal chain training groups = mat_obj.groups() - if groups[1] == key: - valid_objf[int(groups[0])] = groups[2] + else: + continue + if groups[1] == key: + valid_objf[int(groups[0])] = groups[2] if not valid_objf: - raise KaldiLogParseException("Could not find any lines with {k} in " + raise KaldiLogParseException("Could not find any values at with {k} in " " {l}".format(k=key, l=valid_prob_files)) iters = list(set(valid_objf.keys()).intersection(train_objf.keys())) @@ -509,7 +536,8 @@ def parse_rnnlm_prob_logs(exp_dir, key='objf'): -def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): + +def 
generate_acc_logprob_report(exp_dir, key="accuracy", output="output", get_smbr_objf=False): try: times = get_train_times(exp_dir) except: @@ -523,7 +551,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): if key == "rnnlm_objective": data = list(parse_rnnlm_prob_logs(exp_dir, 'objf')) else: - data = list(parse_prob_logs(exp_dir, key, output)) + data = list(parse_prob_logs(exp_dir, key, output, get_smbr_objf)) except: tb = traceback.format_exc() logger.warning("Error getting info from logs, exception was: " + tb) @@ -532,7 +560,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): try: report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError, IndexError: + except (KeyError, IndexError): continue total_time = 0 @@ -540,4 +568,4 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): total_time += times[iter] report.append("Total training time is {0}\n".format( str(datetime.timedelta(seconds=total_time)))) - return ["\n".join(report), times, data] \ No newline at end of file + return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py index 0503c0135cd..9679634658d 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py @@ -9,3 +9,5 @@ frame_level_objf -- For both recurrent and non-recurrent architectures chain_objf -- LF-MMI objective training """ + +from . import common, dropout_schedule diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 229f290e94c..54c3cf93fdc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -69,14 +69,15 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, alignment_subsampling_factor=3, online_ivector_dir=None, frames_per_iter=20000, frames_per_eg_str="20", srand=0, - egs_opts=None, cmvn_opts=None): + egs_opts=None, cmvn_opts=None, + get_egs_script="steps/nnet3/chain/get_egs.sh"): """Wrapper for steps/nnet3/chain/get_egs.sh See options in that script. 
""" common_lib.execute_command( - """steps/nnet3/chain/get_egs.sh {egs_opts} \ + """{get_egs_script} {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ --online-ivector-dir "{ivector_dir}" \ @@ -90,9 +91,10 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, --alignment-subsampling-factor {alignment_subsampling_factor} \ --stage {stage} \ --frames-per-iter {frames_per_iter} \ - --frames-per-eg {frames_per_eg_str} \ + --frames-per-eg "{frames_per_eg_str}" \ --srand {srand} \ {data} {dir} {lat_dir} {egs_dir}""".format( + get_egs_script=get_egs_script, command=run_opts.egs_command, cmvn_opts=cmvn_opts if cmvn_opts is not None else '', ivector_dir=(online_ivector_dir @@ -121,12 +123,12 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model_string, egs_dir, apply_deriv_weights, min_deriv_time, max_deriv_time_relative, - l2_regularize, xent_regularize, leaky_hmm_coefficient, + l2_regularize, xent_regularize, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts, train_opts, + frame_subsampling_factor, truncate_deriv_weights, run_opts, train_opts, backstitch_training_scale=0.0, backstitch_training_interval=1, - use_multitask_egs=False): + use_multitask_egs=False, objective_opts=""): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -185,8 +187,8 @@ def train_new_models(dir, iter, srand, num_jobs, thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ - --apply-deriv-weights={app_deriv_wts} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --apply-deriv-weights={app_deriv_wts} {objective_opts} \ + --l2-regularize={l2} \ {cache_io_opts} --xent-regularize={xent_reg} \ {deriv_time_opts} \ --print-interval=10 --momentum={momentum} \ @@ -208,10 +210,11 @@ def train_new_models(dir, iter, srand, num_jobs, dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), + trunc_deriv=truncate_deriv_weights, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, train_opts=train_opts, - xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, cache_io_opts=cache_io_opts, parallel_train_opts=run_opts.parallel_train_opts, verbose_opt=verbose_opt, @@ -224,7 +227,8 @@ def train_new_models(dir, iter, srand, num_jobs, buf_size=shuffle_buffer_size, num_chunk_per_mb=num_chunk_per_minibatch_str, multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark), + scp_or_ark=scp_or_ark, + objective_opts=objective_opts), require_zero_status=True) threads.append(thread) @@ -240,12 +244,12 @@ def train_one_iteration(dir, iter, srand, egs_dir, apply_deriv_weights, min_deriv_time, max_deriv_time_relative, l2_regularize, xent_regularize, - leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, + frame_subsampling_factor, truncate_deriv_weights, run_opts, dropout_edit_string="", train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1, - use_multitask_egs=False): + use_multitask_egs=False, + objective_opts=""): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -275,8 +279,9 @@ def train_one_iteration(dir, iter, srand, egs_dir, compute_train_cv_probabilities( dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - 
leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, - use_multitask_egs=use_multitask_egs) + run_opts=run_opts, + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) if iter > 0: # Runs in the background @@ -302,6 +307,18 @@ def train_one_iteration(dir, iter, srand, egs_dir, cur_max_param_change = float(max_param_change) / math.sqrt(2) raw_model_string = raw_model_string + dropout_edit_string + + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) + + objf_info = "" if objective_opts == "" else ( + "and objective_opts=" + objective_opts) + logger.info("On iteration {0}, learning rate is {1}" + "{shrink_info} {objf_info}.".format( + iter, learning_rate, + shrink_info=shrink_info_str, objf_info=objf_info)) + train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, @@ -312,19 +329,20 @@ def train_one_iteration(dir, iter, srand, egs_dir, max_deriv_time_relative=max_deriv_time_relative, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, momentum=momentum, max_param_change=cur_max_param_change, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, + truncate_deriv_weights=truncate_deriv_weights, run_opts=run_opts, train_opts=train_opts, # linearly increase backstitch_training_scale during the # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * iter / 15 if iter < 15 else backstitch_training_scale), backstitch_training_interval=backstitch_training_interval, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -469,9 +487,10 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, - xent_regularize, leaky_hmm_coefficient, + xent_regularize, run_opts, - use_multitask_egs=False): + use_multitask_egs=False, + objective_opts=""): model = '{0}/{1}.mdl'.format(dir, iter) scp_or_ark = "scp" if use_multitask_egs else "ark" egs_suffix = ".scp" if use_multitask_egs else ".cegs" @@ -481,20 +500,24 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, egs_prefix="valid_diagnostic.", use_multitask_egs=use_multitask_egs) + import re + objective_opts = re.sub(r"--mmi-factor=0.0 ", "--mmi-factor=1e-10 ", + objective_opts) common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} \ - --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ + nnet3-chain-compute-prob --l2-regularize={l2} {objective_opts} \ + --xent-regularize={xent_reg} \ {model} {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, + l2=l2_regularize, xent_reg=xent_regularize, egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, + 
objective_opts=objective_opts)) multitask_egs_opts = common_train_lib.get_multitask_egs_opts( egs_dir, @@ -503,17 +526,18 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} \ - --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ + nnet3-chain-compute-prob --l2-regularize={l2} {objective_opts} \ + --xent-regularize={xent_reg} \ {model} {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, + l2=l2_regularize, xent_reg=xent_regularize, egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, + objective_opts=objective_opts)) def compute_progress(dir, iter, run_opts): @@ -555,10 +579,11 @@ def compute_progress(dir, iter, run_opts): def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, - egs_dir, leaky_hmm_coefficient, l2_regularize, + egs_dir, l2_regularize, xent_regularize, run_opts, max_objective_evaluations=30, - use_multitask_egs=False): + use_multitask_egs=False, + objective_opts=""): """ Function to do model combination In the nnet3 setup, the logic @@ -599,9 +624,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st common_lib.execute_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine \ + nnet3-chain-combine {objective_opts} \ --max-objective-evaluations={max_objective_evaluations} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --l2-regularize={l2} \ --verbose=3 {combine_gpu_opt} {dir}/den.fst {raw_models} \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ @@ -612,13 +637,14 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st combine_queue_opt=run_opts.combine_queue_opt, combine_gpu_opt=run_opts.combine_gpu_opt, max_objective_evaluations=max_objective_evaluations, - l2=l2_regularize, leaky=leaky_hmm_coefficient, + l2=l2_regularize, dir=dir, raw_models=" ".join(raw_model_strings), num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, + objective_opts=objective_opts)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the @@ -626,6 +652,6 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st compute_train_cv_probabilities( dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/ts.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/ts.py new file mode 100644 index 00000000000..ff416657adf --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/ts.py @@ -0,0 +1,471 @@ + + +# Copyright 
2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This is a module with methods which will be used by scripts for +teacher-student training of deep neural network acoustic model with +sequence KL objective. +""" + +import logging +import math +import os +import sys + +import libs.common as common_lib +import libs.nnet3.train.common as common_train_lib +from . import acoustic_model as chain_lib + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + + +def generate_chain_egs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage=0, + left_context_initial=-1, right_context_final=-1, + frame_subsampling_factor=3, + online_ivector_dir=None, + frames_per_iter=20000, frames_per_eg_str="20", srand=0, + egs_opts=None, cmvn_opts=None, transform_dir=None): + """Wrapper for steps/nnet3/chain/get_egs_ts.sh + + See options in that script. + """ + + common_lib.execute_command( + """steps/nnet3/chain/get_egs_ts.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} \ + --right-context {right_context} \ + --left-context-initial {left_context_initial} \ + --right-context-final {right_context_final} \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg_str} \ + --srand {srand} \ + {data} {dir} {lat_dir} {egs_dir}""".format( + command=run_opts.egs_command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + transform_dir=(transform_dir + if transform_dir is not None + else ''), + ivector_dir=(online_ivector_dir + if online_ivector_dir is not None + else ''), + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, + frame_subsampling_factor=frame_subsampling_factor, + stage=stage, frames_per_iter=frames_per_iter, + frames_per_eg_str=frames_per_eg_str, srand=srand, + data=data, lat_dir=lat_dir, dir=dir, egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '')) + + +#def train_new_models(dir, iter, srand, num_jobs, +# num_archives_processed, num_archives, +# raw_model_string, egs_dir, +# apply_deriv_weights, +# min_deriv_time, max_deriv_time_relative, +# l2_regularize, xent_regularize, leaky_hmm_coefficient, +# momentum, max_param_change, +# shuffle_buffer_size, num_chunk_per_minibatch_str, +# frame_subsampling_factor, truncate_deriv_weights, run_opts, +# backstitch_training_scale=0.0, backstitch_training_interval=1, +# use_multitask_egs=False, objective_opts=""): +# """ +# Called from train_one_iteration(), this method trains new models +# with 'num_jobs' jobs, and +# writes files like exp/tdnn_a/24.{1,2,3,..}.raw +# +# We cannot easily use a single parallel SGE job to do the main training, +# because the computation of which archive and which --frame option +# to use for each job is a little complex, so we spawn each one separately. +# this is no longer true for RNNs as we use do not use the --frame option +# but we use the same script for consistency with FF-DNN code +# +# use_multitask_egs : True, if different examples used to train multiple +# tasks or outputs, e.g.multilingual training. +# multilingual egs can be generated using get_egs.sh and +# steps/nnet3/multilingual/allocate_multilingual_examples.py, +# those are the top-level scripts. 
+# """ +# +# deriv_time_opts = [] +# if min_deriv_time is not None: +# deriv_time_opts.append("--optimization.min-deriv-time={0}".format( +# min_deriv_time)) +# if max_deriv_time_relative is not None: +# deriv_time_opts.append("--optimization.max-deriv-time-relative={0}".format( +# int(max_deriv_time_relative))) +# +# threads = [] +# # the GPU timing info is only printed if we use the --verbose=1 flag; this +# # slows down the computation slightly, so don't accumulate it on every +# # iteration. Don't do it on iteration 0 either, because we use a smaller +# # than normal minibatch size, and people may get confused thinking it's +# # slower for iteration 0 because of the verbose option. +# verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "") +# +# for job in range(1, num_jobs+1): +# # k is a zero-based index that we will derive the other indexes from. +# k = num_archives_processed + job - 1 +# # work out the 1-based archive index. +# archive_index = (k % num_archives) + 1 +# # previous : frame_shift = (k/num_archives) % frame_subsampling_factor +# frame_shift = ((archive_index + k/num_archives) +# % frame_subsampling_factor) +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="cegs.", +# archive_index=archive_index, +# use_multitask_egs=use_multitask_egs) +# scp_or_ark = "scp" if use_multitask_egs else "ark" +# cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, +# iter=iter) +# if iter > 0 else "") + +# (" --write-cache={0}/cache.{1}".format(dir, iter + 1) +# if job == 1 else "")) +# +# if truncate_deriv_weights > 0: +# raise NotImplementedError +# +# thread = common_lib.background_command( +# """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ +# nnet3-chain-train-post {parallel_train_opts} {verbose_opt} \ +# --apply-deriv-weights={app_deriv_wts} {objective_opts} \ +# --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ +# {cache_io_opts} --xent-regularize={xent_reg} \ +# {deriv_time_opts} \ +# --print-interval=10 --momentum={momentum} \ +# --max-param-change={max_param_change} \ +# --backstitch-training-scale={backstitch_training_scale} \ +# --backstitch-training-interval={backstitch_training_interval} \ +# --l2-regularize-factor={l2_regularize_factor} \ +# --srand={srand} \ +# "{raw_model}" {dir}/den.fst \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ +# --frame-shift={fr_shft} \ +# {scp_or_ark}:{egs_dir}/egs.{archive_index}.{scp_or_ark} ark:- | \ +# nnet3-shuffle-egs --buffer-size={buf_size} \ +# --srand={srand} ark:- ark:- | nnet3-merge-egs --sort-by-t \ +# --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ +# {dir}/{next_iter}.{job}.raw""".format( +# command=run_opts.command, +# train_queue_opt=run_opts.train_queue_opt, +# dir=dir, iter=iter, srand=iter + srand, +# next_iter=iter + 1, job=job, +# deriv_time_opts=" ".join(deriv_time_opts), +# app_deriv_wts=apply_deriv_weights, +# fr_shft=frame_shift, l2=l2_regularize, +# xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, +# cache_io_opts=cache_io_opts, +# parallel_train_opts=run_opts.parallel_train_opts, +# verbose_opt=verbose_opt, +# momentum=momentum, max_param_change=max_param_change, +# backstitch_training_scale=backstitch_training_scale, +# backstitch_training_interval=backstitch_training_interval, +# l2_regularize_factor=1.0/num_jobs, +# raw_model=raw_model_string, +# egs_dir=egs_dir, archive_index=archive_index, +# buf_size=shuffle_buffer_size, +# num_chunk_per_mb=num_chunk_per_minibatch_str, +# 
multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, +# objective_opts=objective_opts), +# require_zero_status=True) +# +# threads.append(thread) +# +# for thread in threads: +# thread.join() +# +# +#def train_one_iteration(dir, iter, srand, egs_dir, +# num_jobs, num_archives_processed, num_archives, +# learning_rate, shrinkage_value, +# num_chunk_per_minibatch_str, +# apply_deriv_weights, min_deriv_time, +# max_deriv_time_relative, +# l2_regularize, xent_regularize, +# leaky_hmm_coefficient, +# momentum, max_param_change, shuffle_buffer_size, +# frame_subsampling_factor, truncate_deriv_weights, +# run_opts, dropout_edit_string="", +# backstitch_training_scale=0.0, backstitch_training_interval=1, +# use_multitask_egs=False, +# objective_opts=""): +# """ Called from steps/nnet3/chain/train_ts.py for one iteration for +# neural network training with LF-MMI objective +# +# """ +# +# # Set off jobs doing some diagnostics, in the background. +# # Use the egs dir from the previous iteration for the diagnostics +# # check if different iterations use the same random seed +# if os.path.exists('{0}/srand'.format(dir)): +# try: +# saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) +# except (IOError, ValueError): +# logger.error("Exception while reading the random seed " +# "for training") +# raise +# if srand != saved_srand: +# logger.warning("The random seed provided to this iteration " +# "(srand={0}) is different from the one saved last " +# "time (srand={1}). Using srand={0}.".format( +# srand, saved_srand)) +# else: +# with open('{0}/srand'.format(dir), 'w') as f: +# f.write(str(srand)) +# +# # Sets off some background jobs to compute train and +# # validation set objectives +# compute_train_cv_probabilities( +# dir=dir, iter=iter, egs_dir=egs_dir, +# l2_regularize=l2_regularize, xent_regularize=xent_regularize, +# leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, +# use_multitask_egs=use_multitask_egs, +# objective_opts=objective_opts) +# +# if iter > 0: +# # Runs in the background +# chain_lib.compute_progress(dir, iter, run_opts) +# +# do_average = (iter > 0) +# +# raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " +# "--scale={1} {2}/{3}.mdl - |".format( +# learning_rate, shrinkage_value, dir, iter)) +# +# if do_average: +# cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str +# cur_max_param_change = max_param_change +# else: +# # on iteration zero, use a smaller minibatch size (and we will later +# # choose the output of just one of the jobs): the model-averaging isn't +# # always helpful when the model is changing too fast (i.e. it can worsen +# # the objective function), and the smaller minibatch size will help to +# # keep the update stable. 
+# cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str( +# num_chunk_per_minibatch_str) +# cur_max_param_change = float(max_param_change) / math.sqrt(2) +# +# raw_model_string = raw_model_string + dropout_edit_string +# +# shrink_info_str = '' +# if shrinkage_value != 1.0: +# shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) +# +# objf_info = "" if objective_opts == "" else ( +# "and objective_opts=" + objective_opts) +# logger.info("On iteration {0}, learning rate is {1}" +# "{shrink_info} {objf_info}.".format( +# iter, learning_rate, +# shrink_info=shrink_info_str, objf_info=objf_info)) +# +# train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, +# num_archives_processed=num_archives_processed, +# num_archives=num_archives, +# raw_model_string=raw_model_string, +# egs_dir=egs_dir, +# apply_deriv_weights=apply_deriv_weights, +# min_deriv_time=min_deriv_time, +# max_deriv_time_relative=max_deriv_time_relative, +# l2_regularize=l2_regularize, +# xent_regularize=xent_regularize, +# leaky_hmm_coefficient=leaky_hmm_coefficient, +# momentum=momentum, +# max_param_change=cur_max_param_change, +# shuffle_buffer_size=shuffle_buffer_size, +# num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, +# frame_subsampling_factor=frame_subsampling_factor, +# truncate_deriv_weights=truncate_deriv_weights, +# run_opts=run_opts, +# # linearly increase backstitch_training_scale during the +# # first few iterations (hard-coded as 15) +# backstitch_training_scale=(backstitch_training_scale * +# iter / 15 if iter < 15 else backstitch_training_scale), +# backstitch_training_interval=backstitch_training_interval, +# use_multitask_egs=use_multitask_egs, +# objective_opts=objective_opts) +# +# [models_to_average, best_model] = common_train_lib.get_successful_models( +# num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) +# nnets_list = [] +# for n in models_to_average: +# nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) +# +# if do_average: +# # average the output of the different jobs. +# common_train_lib.get_average_nnet_model( +# dir=dir, iter=iter, +# nnets_list=" ".join(nnets_list), +# run_opts=run_opts) +# +# else: +# # choose the best model from different jobs +# common_train_lib.get_best_nnet_model( +# dir=dir, iter=iter, +# best_model_index=best_model, +# run_opts=run_opts) +# +# try: +# for i in range(1, num_jobs + 1): +# os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) +# except OSError: +# raise Exception("Error while trying to delete the raw models") +# +# new_model = "{0}/{1}.mdl".format(dir, iter + 1) +# +# if not os.path.isfile(new_model): +# raise Exception("Could not find {0}, at the end of " +# "iteration {1}".format(new_model, iter)) +# elif os.stat(new_model).st_size == 0: +# raise Exception("{0} has size 0. 
Something went wrong in " +# "iteration {1}".format(new_model, iter)) +# if os.path.exists("{0}/cache.{1}".format(dir, iter)): +# os.remove("{0}/cache.{1}".format(dir, iter)) +# +# +#def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, +# xent_regularize, leaky_hmm_coefficient, +# run_opts, +# use_multitask_egs=False, +# objective_opts=""): +# model = '{0}/{1}.mdl'.format(dir, iter) +# scp_or_ark = "scp" if use_multitask_egs else "ark" +# egs_suffix = ".scp" if use_multitask_egs else ".egs" +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="valid_diagnostic.", +# use_multitask_egs=use_multitask_egs) +# +# import re +# objective_opts = re.sub(r"--mmi-factor=0.0 ", "--mmi-factor=1e-10 ", +# objective_opts) +# +# common_lib.background_command( +# """{command} {dir}/log/compute_prob_valid.{iter}.log \ +# nnet3-chain-compute-prob-post --l2-regularize={l2} {objective_opts} \ +# --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ +# {model} {dir}/den.fst \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ +# ark:- | nnet3-merge-egs --sort-by-t --minibatch-size=1:64 ark:- ark:- |" \ +# """.format(command=run_opts.command, dir=dir, iter=iter, model=model, +# l2=l2_regularize, leaky=leaky_hmm_coefficient, +# xent_reg=xent_regularize, +# egs_dir=egs_dir, +# multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, +# objective_opts=objective_opts)) +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="train_diagnostic.", +# use_multitask_egs=use_multitask_egs) +# +# common_lib.background_command( +# """{command} {dir}/log/compute_prob_train.{iter}.log \ +# nnet3-chain-compute-prob-post --l2-regularize={l2} {objective_opts} \ +# --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ +# "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ +# ark:- | nnet3-merge-egs --sort-by-t --minibatch-size=1:64 ark:- ark:- |" \ +# """.format(command=run_opts.command, dir=dir, iter=iter, model=model, +# l2=l2_regularize, leaky=leaky_hmm_coefficient, +# xent_reg=xent_regularize, +# egs_dir=egs_dir, +# multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, +# objective_opts=objective_opts)) +# +# +#def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, +# egs_dir, leaky_hmm_coefficient, l2_regularize, +# xent_regularize, run_opts, +# max_objective_evaluations=30, +# use_multitask_egs=False, +# objective_opts=""): +# """ Function to do model combination +# +# In the nnet3 setup, the logic +# for doing averaging of subsets of the models in the case where +# there are too many models to reliably esetimate interpolation +# factors (max_models_combine) is moved into the nnet3-combine. +# """ +# raw_model_strings = [] +# logger.info("Combining {0} models.".format(models_to_combine)) +# +# models_to_combine.add(num_iters) +# +# for iter in sorted(models_to_combine): +# model_file = '{0}/{1}.mdl'.format(dir, iter) +# if os.path.exists(model_file): +# # we used to copy them with nnet3-am-copy --raw=true, but now +# # the raw-model-reading code discards the other stuff itself. 
+# raw_model_strings.append(model_file) +# else: +# print("{0}: warning: model file {1} does not exist " +# "(final combination)".format(sys.argv[0], model_file)) +# +# scp_or_ark = "scp" if use_multitask_egs else "ark" +# egs_suffix = ".scp" if use_multitask_egs else ".egs" +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="combine.", +# use_multitask_egs=use_multitask_egs) +# +# # We reverse the order of the raw model strings so that the freshest one +# # goes first. This is important for systems that include batch +# # normalization-- it means that the freshest batch-norm stats are used. +# # Since the batch-norm stats are not technically parameters, they are not +# # combined in the combination code, they are just obtained from the first +# # model. +# raw_model_strings = list(reversed(raw_model_strings)) +# +# common_lib.execute_command( +# """{command} {combine_queue_opt} {dir}/log/combine.log \ +# nnet3-chain-combine-post {objective_opts} \ +# --max-objective-evaluations={max_objective_evaluations} \ +# --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ +# --verbose=3 {dir}/den.fst {raw_models} \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ +# nnet3-merge-egs --sort-by-t --minibatch-size={num_chunk_per_mb} \ +# ark:- ark:- |" - \| \ +# nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ +# {dir}/final.mdl""".format( +# command=run_opts.command, +# combine_queue_opt=run_opts.combine_queue_opt, +# max_objective_evaluations=max_objective_evaluations, +# l2=l2_regularize, leaky=leaky_hmm_coefficient, +# dir=dir, raw_models=" ".join(raw_model_strings), +# num_chunk_per_mb=num_chunk_per_minibatch_str, +# num_iters=num_iters, +# egs_dir=egs_dir, +# multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, +# objective_opts=objective_opts)) +# +# # Compute the probability of the final, combined model with +# # the same subset we used for the previous compute_probs, as the +# # different subsets will lead to different probs. 
+# compute_train_cv_probabilities( +# dir=dir, iter='final', egs_dir=egs_dir, +# l2_regularize=l2_regularize, xent_regularize=xent_regularize, +# leaky_hmm_coefficient=leaky_hmm_coefficient, +# run_opts=run_opts, +# use_multitask_egs=use_multitask_egs, +# objective_opts=objective_opts) + diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 720164e5436..fd0297c6b5e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -14,9 +14,11 @@ import os import math import re +import sys import shutil import libs.common as common_lib +import libs.nnet3.train.dropout_schedule from libs.nnet3.train.dropout_schedule import * logger = logging.getLogger(__name__) @@ -431,7 +433,7 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, if (feat_dim != 0 and feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): raise Exception("There is mismatch between featdim/ivector_dim of " "the current experiment and the provided " - "egs directory") + "egs directory: feat_dim {0} vs {1} and ivector_dim {2} vs {3}".format(feat_dim, egs_feat_dim, ivector_dim, egs_ivector_dim)) if (((egs_ivector_id is None) and (ivector_extractor_id is not None)) or ((egs_ivector_id is not None) and (ivector_extractor_id is None))): @@ -921,6 +923,14 @@ def __init__(self, action=common_lib.StrToBoolAction, help="Compute train and validation " "accuracy per-dim") + self.parser.add_argument("--trainer.objective-scales", + dest='objective_scales', + type=str, + action=common_lib.NullstrToNoneAction, + help="""Objective scales for the outputs + specified as a comma-separated list of pairs + <output-name>:<scale>,<output-name>:<scale>,... + This will be passed to the training binary.""") # General options self.parser.add_argument("--stage", type=int, default=-4, @@ -938,6 +948,12 @@ def __init__(self, self.parser.add_argument("--egs.cmd", type=str, dest="egs_command", action=common_lib.NullstrToNoneAction, help="Script to launch egs jobs") + self.parser.add_argument("--combine-queue-opt", type=str, dest='combine_queue_opt', + default="", + help="Extra queue options for the model-combination stage") + self.parser.add_argument("--train-queue-opt", type=str, dest='train_queue_opt', + default="", + help="Extra queue options for the training jobs") self.parser.add_argument("--use-gpu", type=str, choices=["true", "false", "yes", "no", "wait"], help="Use GPU for training. 
" diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 0ad93e5977d..b89dc171a74 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -123,7 +123,7 @@ def _parse_dropout_string(dropout_str): dropout_values.reverse() for data_fraction, proportion in dropout_values: assert data_fraction <= 1.0 and data_fraction >= 0.0 - assert proportion <= 1.0 and proportion >= 0.0 + #assert proportion <= 1.0 and proportion >= 0.0 return dropout_values @@ -210,6 +210,21 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): return dropout_proportions +def get_schedule_string(schedule, data_fraction): + if schedule is None: + return 0 + proportions = _get_dropout_proportions( + schedule, data_fraction) + + proportion_string = [] + + for component_name, proportion in proportions: + proportion_string.append( + "{}:{}".format(component_name, proportion)) + + return ' '.join(proportion_string) + + def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): """Return an nnet3-copy --edits line to modify raw_model_string to set dropout proportions according to dropout_proportions. diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index e95de336586..2de57a33a07 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -479,6 +479,8 @@ class XconfigOutputLayer(XconfigLayerBase): ng-linear-options='' : Options, like ng-affine-options, that are passed to the LinearComponent, only in bottleneck layers (i.e. if bottleneck-dim is supplied). + offset-file='' : If specified, then an offset component replaces the + affine component and the presoftmax-scale-file. """ def __init__(self, first_token, key_to_value, prev_names=None): @@ -516,13 +518,19 @@ def set_default_configs(self): # zero values, unlike the hidden layers. 'param-stddev': 0.0, 'bias-stddev': 0.0, + 'offset-file': '' } def check_configs(self): - if self.config['dim'] <= -1: - raise RuntimeError("In output-layer, dim has invalid value {0}" - "".format(self.config['dim'])) + if self.config['offset-file'] == '': + if self.config['dim'] <= -1: + raise RuntimeError("In output-layer, dim has invalid value {0}" + "".format(self.config['dim'])) + if self.config['learning-rate-factor'] <= 0.0: + raise RuntimeError("In output-layer, learning-rate-factor has" + " invalid value {0}" + "".format(self.config['learning-rate-factor'])) if self.config['objective-type'] != 'linear' and \ self.config['objective-type'] != 'quadratic': @@ -566,6 +574,10 @@ def output_dim(self, auxiliary_output=None): # make sense. raise RuntimeError("Outputs of output-layer may not be used by other" " layers") + + if self.config['offset-file'] != '': + return self.descriptors['input']['dim'] + return self.config['dim'] def get_full_config(self): @@ -590,7 +602,8 @@ def _generate_config(self): # config-files, i.e. it contains the 'final' names of nodes. 
descriptor_final_string = self.descriptors['input']['final-string'] input_dim = self.descriptors['input']['dim'] - output_dim = self.config['dim'] + output_dim = (self.config['dim'] if self.config['offset-file'] == '' + else input_dim) bottleneck_dim = self.config['bottleneck-dim'] objective_type = self.config['objective-type'] include_log_softmax = self.config['include-log-softmax'] @@ -602,49 +615,65 @@ def _generate_config(self): if self.config[opt] != '': affine_options += ' {0}={1}'.format(opt, self.config[opt]) - cur_node = descriptor_final_string - cur_dim = input_dim - - if bottleneck_dim >= 0: - if bottleneck_dim == 0 or bottleneck_dim >= input_dim or bottleneck_dim >= output_dim: - raise RuntimeError("Bottleneck dim has value that does not make sense: {0}".format( - bottleneck_dim)) - # This is the bottleneck case (it doesn't necessarily imply we - # will be using the features from the bottleneck; it's just a factorization - # of the matrix into two pieces without a nonlinearity in between). - # We don't include the l2-regularize option because it's useless - # given the orthonormality constraint. - linear_options = self.config['ng-linear-options'] - for opt in [ 'learning-rate-factor', 'l2-regularize', 'max-change' ]: - if self.config[opt] != '': - linear_options += ' {0}={1}'.format(opt, self.config[opt]) - - - # note: by default the LinearComponent uses natural gradient. - line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint={1} param-stddev={2} ' - 'input-dim={3} output-dim={4} max-change=0.75 {5}' - ''.format(self.name, self.config['orthonormal-constraint'], - self.config['orthonormal-constraint'] / math.sqrt(input_dim), - input_dim, bottleneck_dim, linear_options)) + if self.config['offset-file'] == '': + cur_node = descriptor_final_string + cur_dim = input_dim + + if bottleneck_dim >= 0: + if bottleneck_dim == 0 or bottleneck_dim >= input_dim or bottleneck_dim >= output_dim: + raise RuntimeError("Bottleneck dim has value that does not make sense: {0}".format( + bottleneck_dim)) + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. + linear_options = self.config['ng-linear-options'] + for opt in [ 'learning-rate-factor', 'l2-regularize', 'max-change' ]: + if self.config[opt] != '': + linear_options += ' {0}={1}'.format(opt, self.config[opt]) + + + # note: by default the LinearComponent uses natural gradient. 
+ line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint={1} param-stddev={2} ' + 'input-dim={3} output-dim={4} max-change=0.75 {5}' + ''.format(self.name, self.config['orthonormal-constraint'], + self.config['orthonormal-constraint'] / math.sqrt(input_dim), + input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1} output-dim={2} {3}' + ''.format(self.name, cur_dim, output_dim, affine_options)) configs.append(line) - line = ('component-node name={0}.linear component={0}.linear input={1}' + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' ''.format(self.name, cur_node)) configs.append(line) - cur_node = '{0}.linear'.format(self.name) - cur_dim = bottleneck_dim - + cur_node = '{0}.affine'.format(self.name) + else: + line = ('component name={0}.offset' + ' type=PerElementOffsetComponent' + ' vector={1}' + ' max-change={2} {3} {4} {5}' + ''.format(self.name, self.config['offset-file'], + max_change, ng_affine_options, + learning_rate_option, l2_regularize_option)) + configs.append(line) - line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, cur_dim, output_dim, affine_options)) - configs.append(line) - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, cur_node)) - configs.append(line) - cur_node = '{0}.affine'.format(self.name) + line = ('component-node name={0}.offset' + ' component={0}.offset input={1}' + ''.format(self.name, descriptor_final_string)) + configs.append(line) + cur_node = '{0}.offset'.format(self.name) if include_log_softmax: line = ('component name={0}.log-softmax' diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 796ff5fc95c..de9fa481aa2 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -10,6 +10,11 @@ cmd=run.pl skip_scoring=false stage=1 scoring_opts= +write_compact=true +acwt=0.1 +beam=8.0 +read_determinized=true +write_determinized=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -51,12 +56,28 @@ mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir +lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz|" +if ! $read_determinized; then + lats_rspecifier="$lats_rspecifier lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam --write-compact=$write_compact ark:- ark:- |" +fi + +lattice_copy_cmd= +if ! 
$write_determinized; then + if $read_determinized; then + echo "$0: --write-determinized false does not make sense when --read-determinized true is specified" + echo "$0: ignoring the option --write-determinized" + else + lattice_copy_cmd="ark:- | lattice-interp --alpha=0 --alpha-acoustic=1.0 --write-compact=$write_compact \"ark:gunzip -c $indir/lat.JOB.gz |\" ark,s,cs:- " + fi +fi + if [ $stage -le 1 ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore --lm-scale=-1.0 \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ - lattice-lmrescore-const-arpa --lm-scale=1.0 \ - ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; + lattice-lmrescore --lm-scale=-1.0 --write-compact=$write_compact \ + "$lats_rspecifier" "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 --write-compact=$write_compact \ + ark:- "$newlm" $lattice_copy_cmd \ + "ark:|gzip -c>$outdir/lat.JOB.gz" || exit 1; fi if ! $skip_scoring && [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh index a075b8debe8..933aa9f7cdd 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh @@ -21,6 +21,7 @@ # Begin configuration section. cmd=run.pl +keep_subsplit=false skip_scoring=false stage=1 scoring_opts= @@ -73,25 +74,88 @@ fi oldlmcommand="fstproject --project_output=true $oldlm |" mkdir -p $outdir/log -nj=`cat $indir/num_jobs` || exit 1; +nj=$(cat $indir/num_jobs) || exit 1; cp $indir/num_jobs $outdir -lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |" - -lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" +sub_split=1 +if [ -f $indir/sub_split ]; then + sub_split=$(cat $indir/sub_split) || exit 1 +fi if [ $stage -le 1 ]; then - $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ - "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ - lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ - lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ - lattice-lmrescore-const-arpa --lm-scale=1.0 \ - ark:- "$newlm" ark:- \| \ - lattice-project ark:- ark:- \| \ - lattice-compose --write-compact=$write_compact \ - "$lats_rspecifier" \ - ark,s,cs:- "$lats_wspecifier" || exit 1 + if [ $sub_split -eq 1 ]; then + lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |" + lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" + + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" ark:- \| \ + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "$lats_rspecifier" \ + ark,s,cs:- "$lats_wspecifier" || exit 1 + else + # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim + # to have at most two jobs running at each time. The idea is that if we have + # stragglers from one job, we can be processing another one at the same time. 
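A minimal Python sketch of the scheduling idea described in the comment above: launch the batch for subset n and, while it runs, wait for and merge the outputs of subset n-1, so at most two batches are ever in flight. This is an illustration only, not part of the patch; run_batch() and merge_outputs() are placeholders for the per-subset rescoring command and the gunzip/gzip merge step, not Kaldi utilities.

import subprocess

def run_batch(n):
    # placeholder for launching the rescoring jobs of subset n in the background
    return subprocess.Popen(["sleep", "1"])

def merge_outputs(n):
    # placeholder for "gunzip -c lat.n.*.gz | gzip -c > lat.n.gz"
    print("merging lattices for subset", n)

def process(num_subsets):
    prev_n, prev_proc = None, None
    for n in range(1, num_subsets + 2):  # one extra pass so the last subset gets merged
        proc = run_batch(n) if n <= num_subsets else None
        if prev_proc is not None:
            if prev_proc.wait() != 0:
                raise RuntimeError("subset %d failed" % prev_n)
            merge_outputs(prev_n)
        prev_n, prev_proc = n, proc

process(4)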
+ rm $dir/.error 2>/dev/null + + prev_pid= + for n in $(seq $[nj+1]); do + lats_rspecifier="ark:gunzip -c $indir/lat.$n.JOB.gz |" + lats_wspecifier="ark:| gzip -c > $outdir/lat.$n.JOB.gz" + + if [ $n -gt $nj ]; then + this_pid= + elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $model ]; then + echo "$0: Not processing subset $n as already done (delete $dir/.done.$n if not)"; + this_pid= + else + mkdir -p $dir/log/$n + mkdir -p $dir/part + + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" ark:- \| \ + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "$lats_rspecifier" \ + ark,s,cs:- "$lats_wspecifier" || touch $dir/.error & + this_pid=$! + fi + if [ ! -z "$prev_pid" ]; then # Wait for the previous job to merge lattices. + wait $prev_pid + [ -f $dir/.error ] && \ + echo "$0: error generating lattices" && exit 1; + + if ! $keep_subsplit; then + rm $dir/.merge_error 2>/dev/null + echo "$0: Merging archives for data subset $prev_n" + for k in $(seq $sub_split); do + gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; + done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; + [ -f $dir/.merge_error ] && \ + echo "$0: Merging lattices for subset $prev_n failed" && exit 1; + rm $dir/lat.$prev_n.*.gz + fi + touch $dir/.done.$prev_n + fi + prev_n=$n + prev_pid=$this_pid + done + fi +fi + +if $keep_subsplit; then + echo $sub_split > $dir/sub_split fi if ! $skip_scoring && [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index 1dbcbe1a192..d45d915c65a 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -13,6 +13,10 @@ skip_scoring=false max_ngram_order=4 acwt=0.1 weight=0.5 # Interpolation weight for RNNLM. + +expand_ngram=false +beam= +write_compact=true rnnlm_ver= # End configuration section. @@ -80,20 +84,30 @@ mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir +lat="ark:gunzip -c $indir/lat.JOB.gz |" + +if $expand_ngram; then + lat="$lat lattice-expand-ngram --write-compact=$write_compact --n=$max_ngram_order ark:- ark:- |" +fi + +if [ ! 
-z "$beam" ]; then + lat="$lat lattice-prune --write-compact=$write_compact --acoustic-scale=$acwt --beam=$beam ark:- ark:- |" +fi + oldlm_weight=`perl -e "print -1.0 * $weight;"` if [ "$oldlm" == "$oldlang/G.fst" ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore --lm-scale=$oldlm_weight \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ - $rescoring_binary $extra_arg --lm-scale=$weight \ + lattice-lmrescore --lm-scale=$oldlm_weight --write-compact=$write_compact \ + "$lat" "$oldlm_command" ark:- \| \ + $rescoring_binary $extra_arg --lm-scale=$weight --write-compact=$write_compact \ --max-ngram-order=$max_ngram_order \ $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; else $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \ - $rescoring_binary $extra_arg --lm-scale=$weight \ + lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight --write-compact=$write_compact \ + "$lat" "$oldlm" ark:- \| \ + $rescoring_binary $extra_arg --lm-scale=$weight --write-compact=$write_compact \ --max-ngram-order=$max_ngram_order \ $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh new file mode 100755 index 00000000000..d4f25347db5 --- /dev/null +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# Copyright 2015 Guoguo Chen +# 2017 Hainan Xu +# Apache 2.0 + +# This script rescores lattices with RNNLM. See also rnnlmrescore.sh which is +# an older script using n-best lists. + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +stage=1 +max_ngram_order=4 +N=10 +inv_acwt=12 +weight=1.0 # Interpolation weight for RNNLM. + +expand_ngram=false +beam= +write_compact=true +# End configuration section. +rnnlm_ver= +#layer_string= + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# != 5 ]; then + echo "Does language model rescoring of lattices (remove old LM, add new LM)" + echo "with RNNLM." + echo "" + echo "Usage: $0 [options] \\" + echo " <old-lang-dir> <rnnlm-dir> <data-dir> <input-decode-dir> <output-decode-dir>" + echo " e.g.: $0 data/lang_tg ./rnnlm data/test \\" + echo " exp/tri3/test_tg exp/tri3/test_rnnlm" + echo "options: [--cmd (run.pl|queue.pl [queue opts])]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +rnnlm_dir=$2 +data=$3 +indir=$4 +outdir=$5 + +rescoring_binary=lattice-lmrescore-rnnlm + +first_arg=ark:$rnnlm_dir/unk.probs # this is for mikolov's rnnlm +extra_arg= + +if [ "$rnnlm_ver" == "cuedrnnlm" ]; then + layer_string=`cat $rnnlm_dir/layer_string | sed "s=:= =g"` + total_size=`wc -l $rnnlm_dir/unigram.counts | awk '{print $1}'` + rescoring_binary="lattice-lmrescore-cuedrnnlm" + cat $rnnlm_dir/rnnlm.input.wlist.index | tail -n +2 | awk '{print $1-1,$2}' > $rnnlm_dir/rnn.wlist + extra_arg="--full-voc-size=$total_size --layer-sizes=\"$layer_string\"" + first_arg=$rnnlm_dir/rnn.wlist +fi + +if [ "$rnnlm_ver" == "tensorflow" ]; then + rescoring_binary="lattice-lmrescore-tf-rnnlm" + first_arg="$rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final" +fi + +oldlm=$oldlang/G.fst +if [ -f $oldlang/G.carpa ]; then + oldlm=$oldlang/G.carpa +elif [ ! -f $oldlm ]; then + echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\ + exit 1; +fi + +[ ! 
-f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $rnnlm_dir/rnnlm ] && [ ! -d $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; +[ ! -f $rnnlm_dir/unk.probs ] &&\ + echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1; +[ ! -f $oldlang/words.txt ] &&\ + echo "$0: Missing file $oldlang/words.txt" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; +awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { + print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ + || exit 1; + +if [ "$oldlm" == "$oldlang/G.fst" ]; then + lmscore_removing_binary=lattice-lmrescore + oldlm="fstproject --project_output=true $oldlm |" +else + lmscore_removing_binary=lattice-lmrescore-const-arpa +fi + +acwt=`perl -e "print (1.0/$inv_acwt);"` + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +lattice_expand_cmd= +if $expand_ngram; then + lattice_expand_cmd="| lattice-expand-ngram --n=$max_ngram_order ark:- ark:-" +fi + +oldlm_weight=`perl -e "print -1.0 * $weight;"` + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt ${beam:+--beam=$beam} \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + $lmscore_removing_binary --lm-scale=$oldlm_weight \ + ark:- "$oldlm" ark:- $lattice_expand_cmd \| \ + $rescoring_binary $extra_arg --lm-scale=$weight \ + --max-ngram-order=$max_ngram_order \ + $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" ark:- \| \ + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "ark:gunzip -c $indir/lat.JOB.gz |" \ + ark,s,cs:- "ark:| gzip -c > $outdir/lat.JOB.gz" || exit 1 +fi + +if ! $skip_scoring ; then + err_msg="Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + local/score.sh --cmd "$cmd" $data $oldlang $outdir +else + echo "Not scoring because requested so..." +fi + +exit 0; diff --git a/egs/wsj/s5/steps/nnet2/remove_egs.sh b/egs/wsj/s5/steps/nnet2/remove_egs.sh index 143a5d0d86a..673a0c13993 100755 --- a/egs/wsj/s5/steps/nnet2/remove_egs.sh +++ b/egs/wsj/s5/steps/nnet2/remove_egs.sh @@ -10,6 +10,12 @@ # data that's linked to as well as the soft link), and we want to not # delete the examples if someone has done "touch $dir/egs/.nodelete". +force=false + +if [ $1 == "--force" ]; then + force=true + shift +fi if [ $# != 1 ]; then echo "Usage: $0 " @@ -28,14 +34,14 @@ if [ ! -d $egs ]; then exit 1; fi -if [ -f $egs/.nodelete ]; then +if ! $force && [ -f $egs/.nodelete ]; then echo "$0: not deleting egs in $egs since $egs/.nodelete exists" exit 0; fi -for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do +for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs{,_orig}.*.ark; do if [ -L $f ]; then rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. fi diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index 4edc38751c8..baa402e3098 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -24,7 +24,7 @@ extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= graphs_scp= -generate_ali_from_lats=false # If true, alingments generated from lattices. +generate_ali_from_lats=false # If true, alingments generated from lattices. # End configuration options. 
echo "$0 $@" # Print the command line for logging diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh new file mode 100755 index 00000000000..6ed988062b3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + +# This script builds a tree for use in the 'chain' systems (although the script +# itself is pretty generic and doesn't use any 'chain' binaries). This is just +# like the first stages of a standard system, like 'train_sat.sh', except it +# does 'convert-ali' to convert alignments to a monophone topology just created +# from the 'lang' directory (in case the topology is different from where you +# got the system's alignments from), and it stops after the tree-building and +# model-initialization stage, without re-estimating the Gaussians or training +# the transitions. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 +alignment_subsampling_factor=1 +leftmost_questions_truncate=-1 # note: this used to default to 10, but we never + # use this option now with value != -1, and + # we're changing the default +acwt=0.1 +tree_stats_opts= +cluster_phones_opts= +repeat_frames=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/train_sat.sh <#leaves> " + echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_lats_si84 exp/tri3b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." + exit 1; +fi + +numleaves=$1 +data=$2 +lang=$3 +lat_dir=$4 +dir=$5 + +for f in $data/feats.scp $lang/phones.txt $lat_dir/final.mdl $lat_dir/lat.1.gz; do + [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $lat_dir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $lat_dir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $lat_dir/cmvn_opts 2>/dev/null` +delta_opts=`cat $lat_dir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $lat_dir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $lat_dir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $lat_dir/delta_opts $dir 2>/dev/null # delta option. 
+ +utils/lang/check_phones_compatible.sh $lang/phones.txt $lat_dir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. + +if [ -f $lat_dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +## Set up speaker-independent features. +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $lat_dir/final.mat ark:- ark:- |" + cp $lat_dir/final.mat $dir + cp $lat_dir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +# Add fMLLR transforms if available +if [ -f $lat_dir/trans.1 ]; then + echo "$0: Using transforms from $lat_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$lat_dir/trans.JOB ark:- ark:- |" +fi + +# Do subsampling of feats, if needed +if [ $frame_subsampling_factor -gt 1 ]; then + feats="$feats subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" +fi + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo $feats | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. + echo "error getting feature dimension" + exit 1; + fi + $cmd JOB=1 $dir/log/init_mono.log \ + gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1; +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. + echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark:gunzip -c $lat_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \ + convert-ali --frame-subsampling-factor=$alignment_subsampling_factor \ + $lat_dir/final.mdl $dir/mono.mdl $dir/mono.tree ark:- ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "$feats" ark:- $dir/JOB.treeacc || exit 1; + [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions --leftmost-questions-truncate=$leftmost_questions_truncate \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + # questions_truncated.int will be needed later on when we build the phone + # language model for 'chain' training. It's a mechanism of keeping the graph + # small. 
+ if [ $leftmost_questions_truncate -gt 0 ]; then + head -n $leftmost_questions_truncate $dir/questions.int > $dir/questions_truncated.int + else + cp $dir/questions.int $dir/questions_truncated.int + fi + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments to the new tree. Note: we likely will not use these + # converted alignments in the CTC system directly, but they could be useful + # for other purposes. + echo "$0: Converting alignments from $lat_dir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark:gunzip -c $lat_dir/lat.JOB.gz |" ark:/dev/null ark:- \| \ + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$alignment_subsampling_factor \ + $lat_dir/final.mdl $dir/1.mdl $dir/tree \ + ark:- "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree + diff --git a/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh b/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh new file mode 100755 index 00000000000..a582e9efc40 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh @@ -0,0 +1,36 @@ +#! /bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +fst_scale=0.5 +acwt=0.1 +cmd=run.pl + +echo $* + +. ./cmd.sh +. utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chain/tdnn exp/chain/tri5_lats exp/chain/tdnn/egs" + exit 1 +fi + +chaindir=$1 +latdir=$2 +dir=$3 + +nj=$(cat $latdir/num_jobs) || exit 1 + +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" + +$cmd JOB=1:$nj $dir/log/get_post.JOB.log \ + chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$fst_scale \ + $chaindir/den.fst $chaindir/0.trans_mdl "$lats_rspecifier" \ + ark,scp:$dir/numerator_post.JOB.ark,$dir/numerator_post.JOB.scp || exit 1 + +for n in $(seq $nj); do + cat $dir/numerator_post.$n.scp +done > $dir/numerator_post.scp diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 9996820d6d3..eb1fb773c9f 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -76,6 +76,7 @@ lattice_prune_beam= # If supplied, the lattices will be pruned to this b acwt=0.1 # For pruning deriv_weights_scp= generate_egs_scp=false +no_chunking=false echo "$0 $@" # Print the command line for logging @@ -132,6 +133,8 @@ dir=$4 [ ! -z "$online_ivector_dir" ] && \ extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +$no_chunking && extra_files="$extra_files $data/allowed_lengths.txt" + for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; @@ -151,8 +154,20 @@ mkdir -p $dir/log $dir/info # Get list of validation utterances. 
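A usage sketch for the get_chain_graph_post.sh helper introduced above (experiment directories are hypothetical): it reads den.fst and 0.trans_mdl from the chain directory, converts each lattice archive to per-frame graph posteriors with chain-lattice-to-post, and writes numerator_post.{ark,scp} into the output directory:

# hypothetical paths; --fst-scale and --acwt shown at their default values
steps/nnet3/chain/get_chain_graph_post.sh --cmd "$decode_cmd" \
  --fst-scale 0.5 --acwt 0.1 \
  exp/chain/tdnn_1a exp/chain/tdnn_1a/lats_train exp/chain/tdnn_1a/post_train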
frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 -awk '{print $1}' $data/utt2spk | \ - utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist +if $no_chunking; then + frames_per_eg=$(cat $data/allowed_lengths.txt | tr '\n' , | sed 's/,$//') + + awk '{print $1}' $data/utt2spk | \ + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +else + if [ -z "$frames_per_eg" ]; then + echo "$0: --frames-per-eg is expected if --no-chunking is false" + exit 1 + fi + + awk '{print $1}' $data/utt2spk | \ + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +fi len_uttlist=$(wc -l < $dir/valid_uttlist) if [ $len_uttlist -lt $num_utts_subset ]; then @@ -267,6 +282,7 @@ fi egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +$no_chunking && egs_opts="$egs_opts --no-chunking" [ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh new file mode 100755 index 00000000000..a2054529797 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -0,0 +1,571 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the 'chain' system +# (and also the validation examples used for diagnostics), and puts them in +# separate archives. +# +# This script dumps egs with many frames of labels, controlled by the +# frames_per_eg config variable (default: 25), plus left and right context. +# Because CTC training involves alignment of data, we can't meaningfully train +# frame by frame. The supervision approach involves the time alignment, though-- +# it is just applied in a loose way, where each symbol can appear in the +# frame-range that it was in in the alignment, extended by a certain margin. +# + + +# Begin configuration section. +cmd=run.pl +frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. +frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +alignment_subsampling_factor=3 # frames-per-second of input alignments divided + # by frames-per-second at output of chain model +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +constrained=true # 'constrained=true' is the traditional setup; 'constrained=false' + # gives you the 'unconstrained' egs creation in which the time + # boundaries are not enforced inside chunks. 
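To make the new --no-chunking behaviour in get_egs.sh above concrete: when allowed_lengths.txt is present, frames_per_eg becomes the comma-separated list of allowed utterance lengths rather than a single chunk size, and is forwarded via --num-frames. A small worked example with assumed file contents:

# suppose data/train/allowed_lengths.txt contains the three lines 140, 161 and 185
frames_per_eg=$(cat data/train/allowed_lengths.txt | tr '\n' , | sed 's/,$//')
echo "$frames_per_eg"   # prints 140,161,185 -> passed on as --num-frames=140,161,185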
+left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_egs_combine=0 # #validation examples for combination weights at the very end. +num_train_egs_combine=1000 # number of train examples for the above. +num_egs_diagnostic=400 # number of frames for "compute_prob" jobs +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +right_tolerance= # chain right tolerance == max label delay. +left_tolerance= + +right_tolerance_silence= # Tolerances for silence phones +left_tolerance_silence= + +add_numerator_post=false + +kl_latdir= +kl_fst_scale=0.5 + +graph_posterior_rspecifier= + +stage=0 +max_jobs_run=15 # This should be set to the maximum number of nnet3-chain-get-egs jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. +max_shuffle_jobs_run=50 # the shuffle jobs now include the nnet3-chain-normalize-egs command, + # which is fairly CPU intensive, so we can run quite a few at once + # without overloading the disks. +srand=0 # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually + # 0.5 for unsupervised data. +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +acwt=0.1 # For pruning +phone_insertion_penalty= +deriv_weights_scp= +generate_egs_scp=false +use_den_fst=false + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs" + echo "" + echo "From , 0.trans_mdl (the transition-model), tree (the tree)" + echo "and normalization.fst (the normalization FST, derived from the denominator FST)" + echo "are read." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --max-jobs-run # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" +l echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frames-per-iter <#samples;400000> # Number of frames of data to process per iteration, per" + echo " # process." 
+ echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --frames-per-eg # number of supervised frames per eg on disk" + echo " --frames-overlap-per-eg # number of supervised frames of overlap between egs" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" + echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" + echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" + echo " # very end." + echo " --lattice-lm-scale # If supplied, the graph/lm weight of the lattices will be " + echo " # used (with this scale) in generating supervisions" + echo " --lattice-prune-beam # If supplied, the lattices will be pruned to this beam, " + echo " # before being used to get supervisions." + echo " --acwt # Acoustic scale -- affects pruning" + echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." + echo " --generate-egs-scp # Generates scp files -- Required if the egs will be " + echo " # used for multilingual/multitask training." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +# Check some files. +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=$(cat $latdir/num_jobs) || exit 1 + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + +# Get list of validation utterances. + +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +utils/data/get_utt2dur.sh $data + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." 
+ mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +## Set up features. +echo "$0: feature type is raw" +feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" +valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +else + ivector_opts="" + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 0 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\"" + exit 1 + fi + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes the +# limit, so we limit it to 512. 
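A worked example of the archive sizing computed above and capped below (all numbers illustrative):

num_frames=10000000; frames_per_iter=400000; frames_per_eg=25  # illustrative values
num_archives=$[num_frames/frames_per_iter+1]                   # 25 + 1 = 26 (the +1 rounds up)
egs_per_archive=$[num_frames/(frames_per_eg*num_archives)]     # 10000000/650 = 15384
# if num_archives+4 exceeded the 512 open-filehandle cap, archives_multiple would be
# raised in the loop below until num_archives_intermediate+4 fits.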
+max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" + +chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance=$left_tolerance" + + +chain_supervision_all_opts="$chain_supervision_all_opts --add-numerator-post=$add_numerator_post" + +normalization_fst_scale=1.0 + +lats_rspecifier="ark,s,cs:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.lm-scale=$lattice_lm_scale" + + normalization_fst_scale=$(perl -e " + if ($lattice_lm_scale > 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") +fi + +[ ! 
-z $phone_insertion_penalty ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.phone-ins-penalty=$phone_insertion_penalty" + +[ ! -z $right_tolerance_silence ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.right-tolerance-silence=$right_tolerance_silence" + +[ ! -z $left_tolerance_silence ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance-silence=$left_tolerance_silence" + +if [ ! -z $left_tolerance_silence ] && [ ! -z $right_tolerance_silence ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.silence-phones=$(cat $lang/phones/silence_phones.csl)" +fi + +if ! $constrained; then + chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-unconstrained" +fi + +chain_supervision_all_opts="$chain_supervision_all_opts --acoustic-scale=$acwt" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final + +if [ -z "$graph_posterior_rspecifier" ]; then + if [ ! -z "$kl_latdir" ]; then + if [ $stage -le 1 ]; then + steps/nnet3/chain/get_chain_graph_post.sh \ + --cmd "$cmd" --fst-scale $kl_fst_scale --acwt $acwt \ + $chaindir $kl_latdir $dir || exit 1 + fi + + if [ ! -s "$dir/numerator_post.scp" ]; then + echo "$0: Could not find $dir/numerator_post.scp. Something went wrong." + exit 1 + fi + + graph_posterior_rspecifier="scp:$dir/numerator_post.scp" + fi +fi + +if $use_den_fst; then + chain_supervision_all_opts="--den-fst=`dirname $dir`/den.fst" +fi + +if [ $stage -le 2 ]; then + echo "$0: Getting validation and training subset examples in background." + rm $dir/.error 2>/dev/null + + ( + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + --write-compact=false "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="$graph_posterior_rspecifier"} \ + $egs_opts $chaindir/normalization.fst \ + "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ + ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="$graph_posterior_rspecifier"} \ + $egs_opts $chaindir/normalization.fst \ + "$train_subset_feats" $chaindir/tree $chaindir/0.trans_mdl \ + ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 & + wait + sleep 5 # wait for file system to sync. + echo "... Getting subsets of validation examples for diagnostics and combination." 
+ if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || exit 1 & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + $valid_diagnostic_output || exit 1 & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || exit 1 & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + $train_diagnostic_output || exit 1 & + wait + sleep 5 # wait for file system to sync. + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + ) || touch $dir/.error & +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + normalization_fst_maybe= + if $add_numerator_post; then + normalization_fst_maybe=$chaindir/normalization.fst + fi + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl \ + "$lats_rspecifier" ark:- \| \ + nnet3-chain-split-and-get-egs $chain_supervision_all_opts \ + $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ + --num-frames-overlap=$frames_overlap_per_eg \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="$graph_posterior_rspecifier"} \ + $normalization_fst_maybe "$feats" $chaindir/tree $chaindir/0.trans_mdl \ + ark,s,cs:- ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. 
+ egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + normalize_egs=true + if $use_den_fst || $add_numerator_post; then + normalize_egs=false + fi + + if [ $archives_multiple == 1 ]; then # normal case. + if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi + + if $normalize_egs; then + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \ + $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + else + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" \ + $output_archive || exit 1; + fi + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + fi + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + if $normalize_egs; then + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + else + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + fi + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -f $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + cat $dir/cegs.$j.$y.scp || exit 1; + done + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.*.scp; do rm $f; done + fi + fi +fi + +wait +[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. 
+ rm cegs_orig.*.ark 2>/dev/null + ) + if ! $generate_egs_scp && [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + if [ -z "$lat_copy_src" ]; then + rm $dir/lat_special.*.ark + fi + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh new file mode 100755 index 00000000000..01656d85070 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh @@ -0,0 +1,492 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the 'chain' system +# (and also the validation examples used for diagnostics), and puts them in +# separate archives. +# +# This script dumps egs with many frames of labels, controlled by the +# frames_per_eg config variable (default: 25), plus left and right context. +# Because CTC training involves alignment of data, we can't meaningfully train +# frame by frame. The supervision approach involves the time alignment, though-- +# it is just applied in a loose way, where each symbol can appear in the +# frame-range that it was in in the alignment, extended by a certain margin. +# + + +# Begin configuration section. +cmd=run.pl +frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. +frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_egs_combine=0 # #validation examples for combination weights at the very end. +num_train_egs_combine=1000 # number of train examples for the above. +num_egs_diagnostic=400 # number of frames for "compute_prob" jobs +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms + +stage=0 +max_jobs_run=15 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. 
+max_shuffle_jobs_run=50 +srand=0 # rand seed for nnet3-chain-get-egs-post, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions +egs_weight=1.0 # The weight which determines how much each training example + # contributes to gradients while training (can be used + # to down/up-weight a dataset) +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +acwt=0.1 # For pruning +phone_insertion_penalty= +deriv_weights_scp= +generate_egs_scp=false +no_chunking=false + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs" + echo "" + echo "From , 0.trans_mdl (the transition-model), tree (the tree)" + echo "and normalization.fst (the normalization FST, derived from the denominator FST)" + echo "are read." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --max-jobs-run # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frames-per-iter <#samples;400000> # Number of frames of data to process per iteration, per" + echo " # process." + echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --frames-per-eg # number of supervised frames per eg on disk" + echo " --frames-overlap-per-eg # number of supervised frames of overlap between egs" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" + echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" + echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +# Check some files. +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +$no_chunking && extra_files="$extra_files $data/allowed_lengths.txt" + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=$(cat $latdir/num_jobs) || exit 1 + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + +# Get list of validation utterances. 
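The new get_egs_ts.sh takes the same four positional arguments as get_egs.sh (data dir, chain dir, lattice dir, egs dir) and reads 0.trans_mdl, tree and normalization.fst from the chain directory; a hypothetical invocation:

# hypothetical paths and option values
steps/nnet3/chain/get_egs_ts.sh --cmd "$decode_cmd" \
  --frames-per-eg 150 --frame-subsampling-factor 3 \
  --online-ivector-dir exp/nnet3/ivectors_train \
  data/train_hires exp/chain/tdnn_teacher exp/chain/tdnn_teacher/lats_train \
  exp/chain/tdnn_student/egs_ts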
+ +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +utils/data/get_utt2dur.sh $data + +if $no_chunking; then + frames_per_eg=$(cat $data/allowed_lengths.txt | tr '\n' , | sed 's/,$//') + + cut -d ' ' -f 1 $data/utt2spk | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +else + cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +fi + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +if $no_chunking; then + cut -d ' ' -f 1 $data/utt2spk | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +else + cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +fi + +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +[ -z "$transform_dir" ] && transform_dir=$latdir + +# because we'll need the features with a different number of jobs than $latdir, +# copy to ark,scp. +if [ -f $transform_dir/raw_trans.1 ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +## Set up features. +echo "$0: feature type is raw" +feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" +valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. 
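To make the minimum-length filter used above concrete (frame shift and chunk size here are illustrative): an utterance qualifies for the validation/train subsets only if its duration in frames, duration/frame_shift, is at least frames_per_eg:

# with fs=0.01 s and min_len=150 frames, a 1.2 s utterance (120 frames) is dropped
# and a 2.0 s utterance (200 frames) is kept; data/train_hires is a hypothetical dir
awk -v min_len=150 -v fs=0.01 '{if ($2 * 1/fs >= min_len) print $1}' data/train_hires/utt2dur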
+ +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +else + ivector_opts="" + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\"" + exit 1 + fi + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes the +# limit, so we limit it to 512. +max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +$no_chunking && egs_opts="$egs_opts --no-chunking" + +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" + +chain_supervision_all_opts="--acoustic-scale=$acwt" + +normalization_scale=1.0 + +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" +fi + +[ ! -z $phone_insertion_penalty ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --phone-ins-penalty=$phone_insertion_penalty" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final + +if true || [ $stage -le 2 ]; then + echo "$0: Getting validation and training subset examples in background." + rm $dir/.error 2>/dev/null + + ( + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + nnet3-chain-get-egs-post $chain_supervision_all_opts $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + $chaindir/0.trans_mdl "$valid_feats" scp:- "ark:$dir/valid_all.cegs" || exit 1 & + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + nnet3-chain-get-egs-post $chain_supervision_all_opts $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + $chaindir/0.trans_mdl "$train_subset_feats" scp:- \ + "ark:$dir/train_subset_all.cegs" || exit 1 & + wait + sleep 5 # wait for file system to sync. + echo "... Getting subsets of validation examples for diagnostics and combination." 
+ if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || exit 1 & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + $valid_diagnostic_output || exit 1 & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || exit 1 & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + $train_diagnostic_output || exit 1 & + wait + sleep 5 # wait for file system to sync. + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + ) || touch $dir/.error & +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + nnet3-chain-get-egs-post $chain_supervision_all_opts $ivector_opts --srand=\$[JOB+$srand] \ + $egs_opts --num-frames-overlap=$frames_overlap_per_eg \ + $chaindir/normalization.fst \ + $chaindir/0.trans_mdl "$feats" \ + "$lats_rspecifier" ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "cegs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the cegs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. 
+ if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + fi + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # cegs.intermediate_archive.{1,2,...}.ark will point to cegs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + cat $dir/cegs.$j.$y.scp || exit 1; + done + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.*.scp; do rm $f; done + fi + fi +fi + +wait +[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null + ) + if ! $generate_egs_scp && [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + # rm $dir/lat_special.*.ark + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh new file mode 100755 index 00000000000..3467e887cd5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2014-17 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script creates denominator FST (den.fst) and normalization.fst for +# chain training. It additional copies the transition model and tree from the +# first alignment directory to the chain directory. +# This script can accept multiple sources of alignments that can be +# weighted to estimate phone LM. + +set -o pipefail + +# begin configuration section. +cmd=run.pl +stage=-10 +weights= +#end configuration section. + +help_message="Usage: "$(basename $0)" [options] [ ...] + E.g. "$(basename $0)" exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp +Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. +"; + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 2 ]; then + printf "$help_message\n"; + exit 1; +fi + +dir=${@: -1} # last argument to the script +ali_dirs=( $@ ) # read the remaining arguments into an array +unset ali_dirs[${#ali_dirs[@]}-1] # 'pop' the last argument which is odir +num_sys=${#ali_dirs[@]} # number of systems to combine + +mkdir -p $dir/log + +ali_dir=`echo ${ali_dirs[0]} | cut -d: -f1` + +for f in $ali_dir/ali.1.gz $ali_dir/final.mdl $ali_dir/tree; do + if [ ! -f $f ]; then + echo "$0: Could not find file $f" + exit 1 + fi +done + +cp $ali_dir/tree $dir/ || exit 1 + +for n in `seq 0 $[num_sys-1]`; do + adir=${ali_dirs[$n]} + alignments+=("ark:gunzip -c $adir/ali.*.gz | ali-to-phones $adir/final.mdl ark:- ark:- |") +done + +if [ $stage -le 1 ]; then + $cmd $dir/log/make_phone_lm.log \ + chain-est-phone-lm $lm_opts --scales="$weights" \ + "${alignments[@]}" $dir/phone_lm.fst || exit 1 +fi + +if [ $stage -le 2 ]; then + copy-transition-model $ali_dir/final.mdl $dir/0.trans_mdl +fi + +if [ $stage -le 3 ]; then + $cmd $dir/log/make_den_fst.log \ + chain-make-den-fst $dir/tree $dir/0.trans_mdl \ + $dir/phone_lm.fst \ + $dir/den.fst $dir/normalization.fst || exit 1 +fi + +exit 0 diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 7dade75a0ed..3b6371168ce 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -86,37 +86,44 @@ else fi fi -if [ $stage -le 1 ]; then - all_phones="" # will contain the names of the .gz files containing phones, - # with some members possibly repeated per the --num-repeats - # option - for n in `seq 0 $[num_alignments-1]`; do - this_num_repeats=${num_repeats_array[$n]} - this_alignment_dir=${ali_dirs[$n]} - num_jobs=$(cat $this_alignment_dir/num_jobs) - if ! [ "$this_num_repeats" -gt 0 ]; then - echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" - exit 1 - fi +all_phones="" # will contain the names of the .gz files containing phones, + # with some members possibly repeated per the --num-repeats + # option +for n in `seq 0 $[num_alignments-1]`; do + this_num_repeats=${num_repeats_array[$n]} + this_alignment_dir=${ali_dirs[$n]} + num_jobs=$(cat $this_alignment_dir/num_jobs) + if ! 
[ "$this_num_repeats" -ge 0 ]; then + echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" + exit 1 + fi + if [ $stage -le 1 ]; then for j in $(seq $num_jobs); do gunzip -c $this_alignment_dir/ali.$j.gz; done | \ ali-to-phones $this_alignment_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1; + fi - all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)" - done + if [ ! -s $dir/phones.$n.gz ]; then + echo "$dir/phones.$n.gz is empty or does not exist" + exit 1 + fi + all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)" +done + +if [ $stage -le 2 ]; then $cmd $dir/log/make_phone_lm_fst.log \ gunzip -c $all_phones \| \ chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; rm $dir/phones.*.gz fi -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl || exit 1; fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then $cmd $dir/log/make_den_fst.log \ chain-make-den-fst $dir/tree $dir/0.trans_mdl \ $dir/phone_lm.fst \ diff --git a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh index 410a8710b2f..807a14b4fa4 100755 --- a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh @@ -28,6 +28,11 @@ block_size=256 # This is the number of consecutive egs that we take fro # access. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. +lang2num_copies= # comma-separated list of number of copies per + # input language + # This is another way to scale the effect of + # a langauge especially when the language has + # relatively very little data. stage=0 echo "$0 $@" # Print the command line for logging @@ -67,6 +72,15 @@ if [ ${#args[@]} != $[$num_langs+1] ]; then exit 1; fi +num_copies_per_lang= +if [ ! -z "$lang2num_copies" ]; then + IFS=, read -r -a num_copies_per_lang <<< $lang2num_copies + if [ ${#num_copies_per_lang[@]} -ne $num_langs ]; then + echo "$0: --lang2num-copies must be an array of num-langs=$num_langs integers" + exit 1 + fi +fi + required="cegs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" train_scp_list= train_diagnostic_scp_list= @@ -91,12 +105,48 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." 
&& exit 1; fi done - num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + + if [ -z "$lang2num_copies" ] || [ ${num_copies_per_lang[$lang]} -eq 1 ]; then + train_scp_list="$train_scp_list ${multi_egs_dir[$lang]}/cegs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${multi_egs_dir[$lang]}/train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${multi_egs_dir[$lang]}/valid_diagnostic.scp" + combine_scp_list="$combine_scp_list ${multi_egs_dir[$lang]}/combine.scp" + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + else + rm -f $megs_dir/lang${lang}_cegs.scp $megs_dir/lang${lang}_train_diagnostic.scp \ + $megs_dir/lang${lang}_valid_diagnostic.scp $megs_dir/lang${lang}_combine.scp + + if [ $(perl -e "{print int(${num_copies_per_lang[$lang]})}") != ${num_copies_per_lang[$lang]} ]; then + echo "$0: Expected --lang2num-copies to have only integers; " + echo "$0: got ${num_copies_per_lang[$lang]} for language $lang" + exit 1 + fi + + for i in `seq ${num_copies_per_lang[$lang]}`; do + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/cegs.scp >> \ + $megs_dir/lang${lang}_cegs.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/train_diagnostic.scp >> \ + $megs_dir/lang${lang}_train_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/valid_diagnostic.scp >> \ + $megs_dir/lang${lang}_valid_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/combine.scp >> \ + $megs_dir/lang${lang}_combine.scp + done + + if [ $(head -n1 $megs_dir/lang${lang}_cegs.scp | wc -w) -ne 2 ]; then + echo "$0: Incorrect format in $megs_dir/lang${lang}_cegs.scp; something went wrong!" + exit 1 + fi + + train_scp_list="$train_scp_list $megs_dir/lang${lang}_cegs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list $megs_dir/lang${lang}_train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list $megs_dir/lang${lang}_valid_diagnostic.scp" + combine_scp_list="$combine_scp_list $megs_dir/lang${lang}_combine.scp" + + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + num_archives=$[num_archives * ${num_copies_per_lang[$lang]}] + fi tot_num_archives=$[tot_num_archives+num_archives] - train_scp_list="$train_scp_list ${args[$lang]}/cegs.scp" - train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" - valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" - combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" # check parameter dimension to be the same in all egs dirs for f in $check_params; do @@ -163,6 +213,6 @@ for egs_type in combine train_diagnostic valid_diagnostic; do mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1; mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1; done -mv $megs_dir/info/cegs.num_archives $megs_dir/info/num_archives || exit 1; -mv $megs_dir/info/cegs.num_tasks $megs_dir/info/num_tasks || exit 1; +echo $tot_num_archives > $megs_dir/info/num_archives || exit 1; +echo $num_langs > $megs_dir/info/num_tasks || exit 1; echo "$0: Finished preparing multilingual training example." 
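The --lang2num-copies handling above scales a low-resource language purely at the scp level: each extra copy re-lists the same archive entries under a suffixed key, and the language's archive count is multiplied to match, so the merged egs draw from that language proportionally more often. A minimal, self-contained sketch of the same trick on a toy scp (file names, keys and counts here are made up for illustration):

    # toy cegs.scp for one language; keys and ark offsets are hypothetical
    printf 'utt1 exp/lang0/egs/cegs.1.ark:12\nutt2 exp/lang0/egs/cegs.1.ark:3456\n' > cegs.scp

    num_copies=3
    rm -f cegs_expanded.scp
    for i in $(seq $num_copies); do
      # suffix each key with the copy index, exactly as the awk command above
      # does with $1"-"i, so the duplicated entries keep unique keys
      awk -v i=$i '{print $1"-"i" "$2}' cegs.scp >> cegs_expanded.scp
    done

    # the effective number of archives for this language is scaled the same way
    num_archives=2   # stand-in for $(cat $egs_dir/info/num_archives)
    echo "expanded num_archives: $((num_archives * num_copies))"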
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index a832f57cd8f..066a193e770 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -52,12 +52,16 @@ def get_args(): # egs extraction options parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', - default="20", + default=None, action=common_lib.NullstrToNoneAction, help="""Number of frames per chunk in the examples used to train the RNN. Caution: if you double this you should halve --trainer.samples-per-iter. May be a comma-separated list of alternatives: first width is the 'principal' chunk-width, used preferentially""") + parser.add_argument("--egs.get-egs-script", type=str, + dest='get_egs_script', + default='steps/nnet3/chain/get_egs.sh', + help="Script for creating egs") # chain options parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', @@ -74,6 +78,14 @@ def get_args(): dest='xent_regularize', default=0.0, help="Weight of regularization function which is the " "cross-entropy cost the outputs.") + parser.add_argument("--chain.norm-regularize", type=str, + dest='norm_regularize', default=False, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="""If true, instead of l2-regularization on + output of the network, we use l1-regularization on + exp(output) of the network. This tends to make + exp(output) more like probabilities.""") parser.add_argument("--chain.right-tolerance", type=int, dest='right_tolerance', default=5, help="") parser.add_argument("--chain.left-tolerance", type=int, @@ -86,6 +98,11 @@ def get_args(): action=common_lib.StrToBoolAction, choices=["true", "false"], help="") + parser.add_argument("--chain.truncate-deriv-weights", type=int, + dest='truncate_deriv_weights', default=0, + help="""Can be used to set to zero the weights of + derivs from frames near the edges. (counts subsampled + frames)""") parser.add_argument("--chain.frame-subsampling-factor", type=int, dest='frame_subsampling_factor', default=3, help="ratio of frames-per-second of features we " @@ -99,6 +116,35 @@ def get_args(): dest='left_deriv_truncate', default=None, help="Deprecated. 
Kept for back compatibility") + parser.add_argument("--chain.smbr-extra-opts", type=str, + dest='smbr_extra_opts', default=None, + action=common_lib.NullstrToNoneAction, + help="Some additional options related to sMBR") + parser.add_argument("--chain.smbr-factor-schedule", type=str, + dest='smbr_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for sMBR factor in LF-SMBR training.") + parser.add_argument("--chain.mmi-factor-schedule", type=str, + dest='mmi_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for MMI factor in LF-SMBR training.") + parser.add_argument("--chain.ml-factor-schedule", type=str, + dest='ml_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for ML factor in LF-SMBR training.") + parser.add_argument("--chain.kl-factor-schedule", type=str, + dest='kl_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for KL factor in LF-SMBR training.") + parser.add_argument("--chain.smbr-xent-regularize", default=None, + dest='smbr_xent_regularize', type=float, + help="Xent regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-l2-regularize", default=None, + dest='smbr_l2_regularize', type=float, + help="L2 regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, + dest='smbr_leaky_hmm_coefficient', default=None, + help="") # trainer options parser.add_argument("--trainer.input-model", type=str, @@ -166,6 +212,9 @@ def get_args(): 'required' part of the chunk is defined by the model's {left,right}-context.""") + parser.add_argument("--lang", type=str, + help="Lang directory to get silence pdfs.") + # General options parser.add_argument("--feat-dir", type=str, required=True, help="Directory with features used for training " @@ -195,7 +244,8 @@ def process_args(args): """ Process the options got from get_args() """ - if not common_train_lib.validate_chunk_width(args.chunk_width): + if (args.chunk_width is not None and + not common_train_lib.validate_chunk_width(args.chunk_width)): raise Exception("--egs.chunk-width has an invalid value") if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): @@ -236,20 +286,21 @@ def process_args(args): If you have GPUs and have nvcc installed, go to src/ and do ./configure; make""") - run_opts.train_queue_opt = "--gpu 1" + run_opts.train_queue_opt = "--gpu 1" + " " + args.train_queue_opt run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) - run_opts.combine_queue_opt = "--gpu 1" + run_opts.combine_queue_opt = "--gpu 1" + " " + args.combine_queue_opt run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) else: logger.warning("Without using a GPU this will be very slow. 
" "nnet3 does not yet support multiple threads.") - run_opts.train_queue_opt = "" + run_opts.train_queue_opt = args.train_queue_opt run_opts.parallel_train_opts = "--use-gpu=no" - run_opts.combine_queue_opt = "" + run_opts.combine_queue_opt = args.combine_queue_opt run_opts.combine_gpu_opt = "--use-gpu=no" + run_opts.command = args.command run_opts.egs_command = (args.egs_command if args.egs_command is not None else @@ -258,6 +309,36 @@ def process_args(args): return [args, run_opts] +def get_silence_pdfs(args): + if args.lang is None: + return "" + + out = common_lib.get_command_stdout( + "am-info {0}/0.trans_mdl | grep transition-ids".format(args.dir)) + num_tids = int(out.split()[-1]) + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-pdf " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + pdfs = [int(x) for x in out.split()[1:]] + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-phones --per-frame " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + phones = [int(x) for x in out.split()[1:]] + + silence_phones_list = open( + "{lang}/phones/silence.int" + "".format(lang=args.lang)).readline() + silence_phones = set([int(x) for x in silence_phones_list.split(":")]) + + silence_pdfs = list(set([str(pdfs[i]) for i, ph in enumerate(phones) + if ph in silence_phones])) + return ",".join(sorted(silence_pdfs)) + + def train(args, run_opts): """ The main function for training. @@ -356,7 +437,7 @@ def train(args, run_opts): default_egs_dir = '{0}/egs'.format(args.dir) if ((args.stage <= -3) and args.egs_dir is None): - logger.info("Generating egs") + logger.info("Generating egs using {0}".format(args.get_egs_script)) if (not os.path.exists("{0}/den.fst".format(args.dir)) or not os.path.exists("{0}/normalization.fst".format(args.dir)) or not os.path.exists("{0}/tree".format(args.dir))): @@ -376,13 +457,15 @@ def train(args, run_opts): right_tolerance=args.right_tolerance, frame_subsampling_factor=args.frame_subsampling_factor, alignment_subsampling_factor=args.alignment_subsampling_factor, - frames_per_eg_str=args.chunk_width, + frames_per_eg_str=(args.chunk_width if args.chunk_width is not None + else ""), srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, online_ivector_dir=args.online_ivector_dir, frames_per_iter=args.frames_per_iter, - stage=args.egs_stage) + stage=args.egs_stage, + get_egs_script=args.get_egs_script) if args.egs_dir is None: egs_dir = default_egs_dir @@ -396,7 +479,7 @@ def train(args, run_opts): egs_left_context, egs_right_context, egs_left_context_initial, egs_right_context_final)) - assert(args.chunk_width == frames_per_eg_str) + assert(args.chunk_width is None or args.chunk_width == frames_per_eg_str) num_archives_expanded = num_archives * args.frame_subsampling_factor if (args.num_jobs_final > num_archives_expanded): @@ -461,6 +544,8 @@ def train(args, run_opts): max_deriv_time_relative = \ args.deriv_truncate_margin + model_right_context + silence_pdfs = get_silence_pdfs(args) + logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -493,6 +578,60 @@ def train(args, run_opts): args.shrink_saturation_threshold) else shrinkage_value) + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize + objective_opts = "" + + use_smbr_objective = False + if args.smbr_factor_schedule is not None: + smbr_factors = common_train_lib.get_schedule_string( + 
args.smbr_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break + + if use_smbr_objective: + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) + objective_opts += " --use-smbr-objective" + if silence_pdfs is not None: + objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts + + if args.mmi_factor_schedule is not None: + mmi_factors = common_train_lib.get_schedule_string( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + + if args.ml_factor_schedule is not None: + ml_factors = common_train_lib.get_schedule_string( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factors='{0}'".format(ml_factors) + + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + + objective_opts += " --norm-regularize={0}".format( + "true" if args.norm_regularize else "false") + percent = num_archives_processed * 100.0 / num_archives_to_process epoch = (num_archives_processed * args.num_epochs / num_archives_to_process) @@ -506,6 +645,11 @@ def train(args, run_opts): percent, lrate, shrink_info_str)) + objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) + chain_lib.train_one_iteration( dir=args.dir, iter=iter, @@ -525,17 +669,18 @@ def train(args, run_opts): apply_deriv_weights=args.apply_deriv_weights, min_deriv_time=min_deriv_time, max_deriv_time_relative=max_deriv_time_relative, - l2_regularize=args.l2_regularize, - xent_regularize=args.xent_regularize, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, frame_subsampling_factor=args.frame_subsampling_factor, + truncate_deriv_weights=args.truncate_deriv_weights, run_opts=run_opts, backstitch_training_scale=args.backstitch_training_scale, backstitch_training_interval=args.backstitch_training_interval, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) if args.cleanup: # do a clean up everything but the last 2 models, under certain @@ -559,29 +704,91 @@ def train(args, run_opts): num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize + objective_opts = ("--objective-scales=" + args.objective_scales + if args.objective_scales is not None else "") + + use_smbr_objective = False + if args.smbr_factor_schedule is not None: + smbr_factors = common_train_lib.get_schedule_string( + args.smbr_factor_schedule, + float(num_archives_processed) / 
num_archives_to_process) + + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break + + if use_smbr_objective: + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) + objective_opts += " --use-smbr-objective" + if silence_pdfs is not None: + objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts + + if args.mmi_factor_schedule is not None: + mmi_factors = common_train_lib.get_schedule_string( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + + if args.ml_factor_schedule is not None: + ml_factors = common_train_lib.get_schedule_string( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factors='{0}'".format(ml_factors) + + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + + + objective_opts += " --norm-regularize={0}".format( + "true" if args.norm_regularize else "false") + + objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) + if args.do_final_combination: logger.info("Doing final combination to produce final.mdl") + chain_lib.combine_models( dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, egs_dir=egs_dir, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, - l2_regularize=args.l2_regularize, - xent_regularize=args.xent_regularize, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, run_opts=run_opts, max_objective_evaluations=args.max_objective_evaluations, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) else: logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), "{0}/final.mdl".format(args.dir)) chain_lib.compute_train_cv_probabilities( dir=args.dir, iter=num_iters, egs_dir=egs_dir, - l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, + l2_regularize=l2_regularize, xent_regularize=xent_regularize, run_opts=run_opts, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) common_lib.force_symlink("compute_prob_valid.{iter}.log" "".format(iter=num_iters), "{dir}/log/compute_prob_valid.final.log".format( diff --git a/egs/wsj/s5/steps/nnet3/chain/train_ts.py b/egs/wsj/s5/steps/nnet3/chain/train_ts.py new file mode 100755 index 00000000000..d9419818534 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/train_ts.py @@ -0,0 +1,842 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. 
+ +""" This script is based on steps/nnet3/chain/train.sh +""" + +import argparse +import logging +import os +import pprint +import shutil +import sys +import traceback + +sys.path.insert(0, 'steps') +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.chain_objf.acoustic_model as chain_lib +import libs.nnet3.train.chain_objf.ts as ts_lib +import libs.nnet3.report.log_parse as nnet3_log_parse + + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting chain model trainer (train.py)') + + +def get_args(): + """ Get args from stdin. + + We add compulsary arguments as named arguments for readability + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. + See steps/libs/nnet3/train/common.py + """ + + parser = argparse.ArgumentParser( + description="""Trains RNN and DNN acoustic models using the 'chain' + objective function.""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve', + parents=[common_train_lib.CommonParser().parser]) + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', + default=None, action=common_lib.NullstrToNoneAction, + help="""Number of frames per chunk in the examples + used to train the RNN. Caution: if you double this you + should halve --trainer.samples-per-iter. May be + a comma-separated list of alternatives: first width + is the 'principal' chunk-width, used preferentially""") + + # chain options + parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', + default=None, action=common_lib.NullstrToNoneAction, + help="options to be be passed to chain-est-phone-lm") + parser.add_argument("--chain.l2-regularize", type=float, + dest='l2_regularize', default=0.0, + help="""Weight of regularization function which is the + l2-norm of the output of the network. It should be used + without the log-softmax layer for the outputs. As + l2-norm of the log-softmax outputs can dominate the + objective function.""") + parser.add_argument("--chain.xent-regularize", type=float, + dest='xent_regularize', default=0.0, + help="Weight of regularization function which is the " + "cross-entropy cost the outputs.") + parser.add_argument("--chain.norm-regularize", type=str, + dest='norm_regularize', default=False, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="""If true, instead of l2-regularization on + output of the network, we use l1-regularization on + exp(output) of the network. This tends to make + exp(output) more like probabilities.""") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, + dest='leaky_hmm_coefficient', default=0.00001, + help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, + dest='apply_deriv_weights', default=True, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="") + parser.add_argument("--chain.truncate-deriv-weights", type=int, + dest='truncate_deriv_weights', default=0, + help="""Can be used to set to zero the weights of + derivs from frames near the edges. 
(counts subsampled + frames)""") + parser.add_argument("--chain.frame-subsampling-factor", type=int, + dest='frame_subsampling_factor', default=3, + help="ratio of frames-per-second of features we " + "train on, to chain model's output") + parser.add_argument("--chain.left-deriv-truncate", type=int, + dest='left_deriv_truncate', + default=None, + help="Deprecated. Kept for back compatibility") + parser.add_argument("--chain.smbr-extra-opts", type=str, + dest='smbr_extra_opts', default=None, + action=common_lib.NullstrToNoneAction, + help="Some additional options related to sMBR") + parser.add_argument("--chain.smbr-factor-schedule", type=str, + dest='smbr_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for sMBR factor in LF-SMBR training.") + parser.add_argument("--chain.mmi-factor-schedule", type=str, + dest='mmi_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for MMI factor in LF-SMBR training.") + parser.add_argument("--chain.ml-factor-schedule", type=str, + dest='ml_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for ML factor in LF-SMBR training.") + parser.add_argument("--chain.kl-factor-schedule", type=str, + dest='kl_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for KL factor in LF-SMBR training.") + parser.add_argument("--chain.smbr-xent-regularize", default=None, + dest='smbr_xent_regularize', type=float, + help="Xent regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-l2-regularize", default=None, + dest='smbr_l2_regularize', type=float, + help="L2 regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, + dest='smbr_leaky_hmm_coefficient', default=None, + help="") + + # trainer options + parser.add_argument("--trainer.input-model", type=str, + dest='input_model', default=None, + action=common_lib.NullstrToNoneAction, + help="If specified, this model is used as initial " + "'raw' model (0.raw in the script) instead of " + "initializing the model from the xconfig. " + "Also configs dir is not expected to exist " + "and left/right context is computed from this " + "model.") + parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', + default=10.0, + help="Number of epochs to train the model") + parser.add_argument("--trainer.frames-per-iter", type=int, + dest='frames_per_iter', default=800000, + help="""Each iteration of training, see this many + [input] frames per job. This option is passed to + get_egs.sh. Aim for about a minute of training + time""") + + parser.add_argument("--trainer.num-chunk-per-minibatch", type=str, + dest='num_chunk_per_minibatch', default='128', + help="""Number of sequences to be processed in + parallel every minibatch. 
May be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", + type=float, dest='initial_effective_lrate', + default=0.0002, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", + type=float, dest='final_effective_lrate', + default=0.00002, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.shrink-value", type=float, + dest='shrink_value', default=1.0, + help="""Scaling factor used for scaling the parameter + matrices when the derivative averages are below the + shrink-threshold at the non-linearities. E.g. 0.99. + Only applicable when the neural net contains sigmoid or + tanh units.""") + parser.add_argument("--trainer.optimization.shrink-saturation-threshold", + type=float, + dest='shrink_saturation_threshold', default=0.40, + help="""Threshold that controls when we apply the + 'shrinkage' (i.e. scaling by shrink-value). If the + saturation of the sigmoid and tanh nonlinearities in + the neural net (as measured by + steps/nnet3/get_saturation.pl) exceeds this threshold + we scale the parameter matrices with the + shrink-value.""") + # RNN-specific training options + parser.add_argument("--trainer.deriv-truncate-margin", type=int, + dest='deriv_truncate_margin', default=None, + help="""(Relevant only for recurrent models). If + specified, gives the margin (in input frames) around + the 'required' part of each chunk that the derivatives + are backpropagated to. If unset, the derivatives are + backpropagated all the way to the boundaries of the + input data. E.g. 8 is a reasonable setting. 
Note: the + 'required' part of the chunk is defined by the model's + {left,right}-context.""") + parser.add_argument("--trainer.optimization.do-final-combination", + dest='do_final_combination', type=str, + action=common_lib.StrToBoolAction, + choices=["true", "false"], default=False, + help="""Set this to false to disable the final + 'combine' stage (in this case we just use the + last-numbered model as the final.mdl).""") + + parser.add_argument("--lang", type=str, + help="Lang directory to get silence pdfs.") + + # General options + parser.add_argument("--feat-dir", type=str, required=True, + help="Directory with features used for training " + "the neural network.") + parser.add_argument("--tree-dir", type=str, required=True, + help="""Directory containing the tree to use for this + model (we also expect final.mdl and ali.*.gz in that + directory""") + parser.add_argument("--lat-dir", type=str, required=True, + help="Directory with numerator lattices " + "used for training the neural network.") + parser.add_argument("--dir", type=str, required=True, + help="Directory to store the models and " + "all other files.") + + print(' '.join(sys.argv)) + print(sys.argv) + + args = parser.parse_args() + + [args, run_opts] = process_args(args) + + return [args, run_opts] + + +def process_args(args): + """ Process the options got from get_args() + """ + + if (args.chunk_width is not None and + not common_train_lib.validate_chunk_width(args.chunk_width)): + raise Exception("--egs.chunk-width has an invalid value") + + if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): + raise Exception("--trainer.num-chunk-per-minibatch has an invalid value") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be non-negative") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be non-negative") + + if args.left_deriv_truncate is not None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning( + "--chain.left-deriv-truncate (deprecated) is set by user, and " + "--trainer.deriv-truncate-margin is set to negative of that " + "value={0}. We recommend using the option " + "--trainer.deriv-truncate-margin.".format( + args.deriv_truncate_margin)) + + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist." + "".format(args.dir)) + + if args.transform_dir is None: + args.transform_dir = args.lat_dir + # set the options corresponding to args.use_gpu + run_opts = common_train_lib.RunOpts() + if args.use_gpu: + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. + If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.combine_gpu_opt = "" + + else: + logger.warning("Without using a GPU this will be very slow. 
" + "nnet3 does not yet support multiple threads.") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.combine_gpu_opt = "--use-gpu=no" + + run_opts.command = args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) + + return [args, run_opts] + + +def get_silence_pdfs(args): + if args.lang is None: + return "" + + out = common_lib.get_command_stdout( + "am-info {0}/0.trans_mdl | grep transition-ids".format(args.dir)) + num_tids = int(out.split()[-1]) + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-pdf " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + pdfs = [int(x) for x in out.split()[1:]] + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-phones --per-frame " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + phones = [int(x) for x in out.split()[1:]] + + silence_phones_list = open( + "{lang}/phones/silence.int" + "".format(lang=args.lang)).readline() + silence_phones = set([int(x) for x in silence_phones_list.split(":")]) + + silence_pdfs = list(set([str(pdfs[i]) for i, ph in enumerate(phones) + if ph in silence_phones])) + return ",".join(sorted(silence_pdfs)) + + +def train(args, run_opts): + """ The main function for training. + + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ + + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Check files + chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, + args.lat_dir if args.egs_dir is None + else None) + + # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) + + # Set some variables. + num_jobs = common_lib.get_number_of_jobs(args.tree_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + common_lib.execute_command("utils/split_data.sh {0} {1}" + "".format(args.feat_dir, num_jobs)) + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) + + if args.input_model is None: + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + variables = common_train_lib.parse_generic_config_vars_file(var_file) + else: + # If args.input_model is specified, the model left and right contexts + # are computed using input_model. + variables = common_train_lib.get_input_model_info(args.input_model) + + # Set some variables. 
+ try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + left_context_initial = (args.chunk_left_context_initial + model_left_context if + args.chunk_left_context_initial >= 0 else -1) + right_context_final = (args.chunk_right_context_final + model_right_context if + args.chunk_right_context_final >= 0 else -1) + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + if (args.stage <= -6): + logger.info("Creating phone language-model") + chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts, + lm_opts=args.lm_opts) + + if (args.stage <= -5): + logger.info("Creating denominator FST") + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) + chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) + + if ((args.stage <= -4) and + os.path.exists("{0}/configs/init.config".format(args.dir)) + and (args.input_model is None)): + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.execute_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) + + egs_left_context = left_context + args.frame_subsampling_factor // 2 + egs_right_context = right_context + args.frame_subsampling_factor // 2 + # note: the '+ args.frame_subsampling_factor / 2' is to allow for the + # fact that we'll be shifting the data slightly during training to give + # variety to the training data. + egs_left_context_initial = (left_context_initial + + args.frame_subsampling_factor // 2 if + left_context_initial >= 0 else -1) + egs_right_context_final = (right_context_final + + args.frame_subsampling_factor // 2 if + right_context_final >= 0 else -1) + + default_egs_dir = '{0}/egs'.format(args.dir) + if ((args.stage <= -3) and args.egs_dir is None): + logger.info("Generating egs using get_egs_ts.sh") + if (not os.path.exists("{0}/den.fst".format(args.dir)) or + not os.path.exists("{0}/normalization.fst".format(args.dir)) or + not os.path.exists("{0}/tree".format(args.dir))): + raise Exception("Chain egs generation expects {0}/den.fst, " + "{0}/normalization.fst and {0}/tree " + "to exist.".format(args.dir)) + # this is where get_egs.sh is called. 
+ ts_lib.generate_chain_egs( + dir=args.dir, data=args.feat_dir, + lat_dir=args.lat_dir, egs_dir=default_egs_dir, + left_context=egs_left_context, + right_context=egs_right_context, + left_context_initial=egs_left_context_initial, + right_context_final=egs_right_context_final, + run_opts=run_opts, + frame_subsampling_factor=args.frame_subsampling_factor, + frames_per_eg_str=(args.chunk_width if args.chunk_width is not None + else ""), + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + frames_per_iter=args.frames_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, + frames_per_eg_str, num_archives] = ( + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, + egs_left_context, egs_right_context, + egs_left_context_initial, + egs_right_context_final)) + assert(args.chunk_width is None or args.chunk_width == frames_per_eg_str) + num_archives_expanded = num_archives * args.frame_subsampling_factor + + if (args.num_jobs_final > num_archives_expanded): + raise Exception('num_jobs_final cannot exceed the ' + 'expanded number of archives') + + # copy the properties of the egs to dir for + # use during decoding + logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) + + if not os.path.exists('{0}/valid_diagnostic.cegs'.format(egs_dir)): + if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))): + raise Exception('neither {0}/valid_diagnostic.cegs nor ' + '{0}/valid_diagnostic.scp exist.' + 'This script expects one of them.'.format(egs_dir)) + use_multitask_egs = True + else: + use_multitask_egs = False + + if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config")) + and (args.input_model is None)): + logger.info('Computing the preconditioning matrix for input features') + + chain_lib.compute_preconditioning_matrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune, + use_multitask_egs=use_multitask_egs) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + chain_lib.prepare_initial_acoustic_model(args.dir, run_opts, + input_model=args.input_model) + + with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: + f.write(str(args.frame_subsampling_factor)) + + # set num_iters so that as close as possible, we process the data + # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == + # $num_epochs*$num_archives, where + # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + num_archives_to_process = int(args.num_epochs * num_archives_expanded) + num_archives_processed = 0 + num_iters = ((num_archives_to_process * 2) + // (args.num_jobs_initial + args.num_jobs_final)) + + # If do_final_combination is True, compute the set of models_to_combine. + # Otherwise, models_to_combine will be none. 
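For orientation, the num_iters expression above is just the total archive-processing budget divided by the average number of parallel jobs per iteration, which is why the factor of 2 and the sum of initial and final job counts appear. A quick numeric check with made-up values:

    num_archives=40; frame_subsampling_factor=3
    num_epochs=4
    num_jobs_initial=2; num_jobs_final=8

    num_archives_expanded=$((num_archives * frame_subsampling_factor))   # 120
    num_archives_to_process=$((num_epochs * num_archives_expanded))      # 480
    # average jobs per iteration is (initial + final) / 2, hence the factor of 2
    num_iters=$(( num_archives_to_process * 2 / (num_jobs_initial + num_jobs_final) ))  # 96
    echo "training would run for $num_iters iterations"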
+ if args.do_final_combination: + models_to_combine = common_train_lib.get_model_combine_iters( + num_iters, args.num_epochs, + num_archives_expanded, args.max_models_combine, + args.num_jobs_final) + else: + models_to_combine = None + + min_deriv_time = None + max_deriv_time_relative = None + if args.deriv_truncate_margin is not None: + min_deriv_time = -args.deriv_truncate_margin - model_left_context + max_deriv_time_relative = \ + args.deriv_truncate_margin + model_right_context + + silence_pdfs = get_silence_pdfs(args) + + logger.info("Training will run for {0} epochs = " + "{1} iterations".format(args.num_epochs, num_iters)) + + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + + (args.num_jobs_final - args.num_jobs_initial) + * float(iter) / num_iters) + + if args.stage <= iter: + model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) + + lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + shrinkage_value = 1.0 - (args.proportional_shrink * lrate) + if shrinkage_value <= 0.5: + raise Exception("proportional-shrink={0} is too large, it gives " + "shrink-value={1}".format(args.proportional_shrink, + shrinkage_value)) + if args.shrink_value < shrinkage_value: + shrinkage_value = (args.shrink_value + if common_train_lib.should_do_shrinkage( + iter, model_file, + args.shrink_saturation_threshold) + else shrinkage_value) + + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize + objective_opts = "" + + use_smbr_objective = False + if args.smbr_factor_schedule is not None: + smbr_factors = common_train_lib.get_schedule_string( + args.smbr_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break + + if use_smbr_objective: + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) + objective_opts += " --use-smbr-objective" + if silence_pdfs is not None: + objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts + + if args.mmi_factor_schedule is not None: + mmi_factors = common_train_lib.get_schedule_string( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + else: + objective_opts += " --mmi-factors='output:0'" + + if args.ml_factor_schedule is not None: + ml_factors = common_train_lib.get_schedule_string( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factors='{0}'".format(ml_factors) + + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + else: + objective_opts += " --kl-factors='output:1'" + + objective_opts += " 
--norm-regularize={0}".format( + "true" if args.norm_regularize else "false") + + percent = num_archives_processed * 100.0 / num_archives_to_process + epoch = (num_archives_processed * args.num_epochs + / num_archives_to_process) + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) + logger.info("Iter: {0}/{1} " + "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " + "lr: {5:0.6f} {6}".format(iter, num_iters - 1, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) + + objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) + + chain_lib.train_one_iteration( + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=lrate, + dropout_edit_string=common_train_lib.get_dropout_edit_string( + args.dropout_schedule, + float(num_archives_processed) / num_archives_to_process, + iter), + train_opts=' '.join(args.train_opts), + shrinkage_value=shrinkage_value, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, + apply_deriv_weights=args.apply_deriv_weights, + min_deriv_time=min_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + frame_subsampling_factor=args.frame_subsampling_factor, + truncate_deriv_weights=args.truncate_deriv_weights, + run_opts=run_opts, + backstitch_training_scale=args.backstitch_training_scale, + backstitch_training_interval=args.backstitch_training_interval, + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) + + if args.cleanup: + # do a clean up everything but the last 2 models, under certain + # conditions + common_train_lib.remove_model( + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_acc_logprob_report( + args.dir, "log-probability")) + message = report + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize + objective_opts = ("--objective-scales=" + args.objective_scales + if args.objective_scales is not None else "") + + use_smbr_objective = False + if args.smbr_factor_schedule is not None: + smbr_factors = common_train_lib.get_schedule_string( + args.smbr_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break + + if use_smbr_objective: + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) + 
objective_opts += " --use-smbr-objective" + if silence_pdfs is not None: + objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts + + if args.mmi_factor_schedule is not None: + mmi_factors = common_train_lib.get_schedule_string( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + else: + objective_opts += " --mmi-factors='output:0'" + + if args.ml_factor_schedule is not None: + ml_factors = common_train_lib.get_schedule_string( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factors='{0}'".format(ml_factors) + + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + else: + objective_opts += " --kl-factors='output:1'" + + + objective_opts += " --norm-regularize={0}".format( + "true" if args.norm_regularize else "false") + + objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) + + if args.do_final_combination: + logger.info("Doing final combination to produce final.mdl") + + chain_lib.combine_models( + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, + egs_dir=egs_dir, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, + run_opts=run_opts, + max_objective_evaluations=args.max_objective_evaluations, + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) + else: + logger.info("Copying the last-numbered model to final.mdl") + common_lib.force_symlink("{0}.mdl".format(num_iters), + "{0}/final.mdl".format(args.dir)) + chain_lib.compute_train_cv_probabilities( + dir=args.dir, iter=num_iters, egs_dir=egs_dir, + l2_regularize=l2_regularize, xent_regularize=xent_regularize, + run_opts=run_opts, + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) + common_lib.force_symlink("compute_prob_valid.{iter}.log" + "".format(iter=num_iters-1), + "{dir}/log/compute_prob_valid.final.log".format( + dir=args.dir)) + + if args.cleanup: + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + # leave the last-two-numbered models, for diagnostic reasons. 
+ common_train_lib.clean_nnet_dir( + args.dir, num_iters - 1, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.generate_acc_logprob_report( + args.dir, "log-probability") + if args.email is not None: + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) + + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) + + common_lib.execute_command("steps/info/chain_dir_info.pl " + "{0}".format(args.dir)) + + +def main(): + [args, run_opts] = get_args() + try: + train(args, run_opts) + common_lib.wait_for_background_commands() + except BaseException as e: + # look for BaseException so we catch KeyboardInterrupt, which is + # what we get when a background thread dies. + if args.email is not None: + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) + if not isinstance(e, KeyboardInterrupt): + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 5b8374a5a1d..1bf4605de94 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -29,6 +29,8 @@ extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= minimize=false +determinize_opts= +write_compact=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -91,10 +93,17 @@ if [ ! -z "$online_ivector_dir" ]; then ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" fi +extra_opts= +lats_wspecifier="ark:|" +if ! $write_compact; then + extra_opts="--determinize-lattice=false" + lats_wspecifier="ark:| lattice-determinize-phone-pruned-non-compact --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize $determinize_opts $model ark:- ark:- |" +fi + if [ "$post_decode_acwt" == 1.0 ]; then - lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" + lats_wspecifier="$lats_wspecifier gzip -c >$dir/lat.JOB.gz" else - lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" + lats_wspecifier="$lats_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz" fi frame_subsampling_opt= @@ -113,8 +122,9 @@ if [ $stage -le 1 ]; then --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt "$model" \ - $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; + --word-symbol-table=$graphdir/words.txt ${extra_opts} \ + "$model" \ + $graphdir/HCLG.fst "$feats" "$lats_wspecifier" || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/decode_semisup.sh b/egs/wsj/s5/steps/nnet3/decode_semisup.sh index 25ce232b2c6..0e3bb4c38c0 100755 --- a/egs/wsj/s5/steps/nnet3/decode_semisup.sh +++ b/egs/wsj/s5/steps/nnet3/decode_semisup.sh @@ -8,6 +8,8 @@ # Begin configuration section. stage=1 nj=4 # number of decoding jobs. +sub_split=1 +keep_subsplit=false acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the # regular scoring script works. 
@@ -108,7 +110,7 @@ extra_opts= lat_wspecifier="ark:|" if ! $write_compact; then extra_opts="--determinize-lattice=false" - lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" + lat_wspecifier="ark:| lattice-determinize-phone-pruned-parallel --num-threads=$num_threads --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" fi if [ "$post_decode_acwt" == 1.0 ]; then @@ -123,22 +125,99 @@ if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" fi +# if this job is interrupted by the user, we want any background jobs to be +# killed too. +cleanup() { + local pids=$(jobs -pr) + [ -n "$pids" ] && kill $pids +} +trap "cleanup" INT QUIT TERM EXIT + # Copy the model as it is required when generating egs cp $model $dir/ || exit 1 if [ $stage -le 1 ]; then - $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ - nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ - --frames-per-chunk=$frames_per_chunk \ - --extra-left-context=$extra_left_context \ - --extra-right-context=$extra_right_context \ - --extra-left-context-initial=$extra_left_context_initial \ - --extra-right-context-final=$extra_right_context_final \ - --minimize=$minimize --word-determinize=$word_determinize \ - --max-active=$max_active --min-active=$min_active --beam=$beam \ - --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \ - $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; + if [ $sub_split -eq 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --word-determinize=$word_determinize \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} $model \ + $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; + else + # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim + # to have at most two jobs running at each time. The idea is that if we have + # stragglers from one job, we can be processing another one at the same time. + rm $dir/.error 2>/dev/null + + prev_pid= + for n in $(seq $[nj+1]); do + lat_subset_wspecifier="ark:|" + if ! 
$write_compact; then + lat_subset_wspecifier="ark:| lattice-determinize-phone-pruned-parallel --num-threads=$num_threads --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" + fi + if [ "$post_decode_acwt" == 1.0 ]; then + lat_subset_wspecifier="$lat_subset_wspecifier gzip -c >$dir/lat.$n.JOB.gz" + else + lat_subset_wspecifier="$lat_subset_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.$n.JOB.gz" + fi + + if [ $n -gt $nj ]; then + this_pid= + elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $model ]; then + echo "$0: Not processing subset $n as already done (delete $dir/.done.$n if not)"; + this_pid= + else + sdata2=$data/split$nj/$n/split${sub_split}utt; + utils/split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=$(echo $feats | sed s:JOB/:$n/split${sub_split}utt/JOB/:g) + $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --word-determinize=$word_determinize \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} $model \ + $graphdir/HCLG.fst "$feats_subset" "$lat_subset_wspecifier" || touch $dir/.error & + this_pid=$! + fi + if [ ! -z "$prev_pid" ]; then # Wait for the previous job to merge lattices. + wait $prev_pid + [ -f $dir/.error ] && \ + echo "$0: error generating lattices" && exit 1; + + if ! $keep_subsplit; then + rm $dir/.merge_error 2>/dev/null + echo "$0: Merging archives for data subset $prev_n" + for k in $(seq $sub_split); do + gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; + done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; + [ -f $dir/.merge_error ] && \ + echo "$0: Merging lattices for subset $prev_n failed" && exit 1; + rm $dir/lat.$prev_n.*.gz + fi + touch $dir/.done.$prev_n + fi + prev_n=$n + prev_pid=$this_pid + done + fi +fi + +if $keep_subsplit; then + echo $sub_split > $dir/sub_split fi if [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 8098b59c4ad..180d396d156 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -140,9 +140,11 @@ echo "$0: feature type is raw" cmvn_opts=$(cat $srcdir/cmvn_opts) || exit 1 + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -cp $srcdir/{splice_opts,cmvn_opts} $dir 2>/dev/null || true +cp $srcdir/{tree,cmvn_opts} $dir || exit 1 +cp $srcdir/splice_opts $dir 2>/dev/null || true ## set iVector options if [ ! -z "$online_ivector_dir" ]; then @@ -307,7 +309,7 @@ fi # set the command to determinize lattices, if specified. 
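Stepping back to the sub-split scheduling added to decode_semisup.sh above (the get_degs.sh hunk resumes right after this note): stripped of the decoding options and of the .done/keep_subsplit bookkeeping, the loop follows the pattern below, where process_subset and merge_lattices are hypothetical stand-ins for the backgrounded nnet3-latgen-faster call and the gunzip/gzip merge of lat.$n.*.gz:

  prev_pid=; prev_n=
  for n in $(seq $[nj+1]); do
    if [ $n -le $nj ]; then
      process_subset $n &   # decode the $sub_split pieces of subset n in the background
      this_pid=$!
    else
      this_pid=             # one extra final pass: nothing left to launch
    fi
    if [ ! -z "$prev_pid" ]; then
      wait $prev_pid        # subset n-1 finishes while subset n is already running
      merge_lattices $prev_n && touch $dir/.done.$prev_n
    fi
    prev_n=$n; prev_pid=$this_pid
  done

At most two subsets are in flight at any time, which is the straggler-hiding behaviour the comment above describes.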
if $determinize_before_split; then - lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --prune=true --beam=$lattice_beam ark:- ark:-" + lattice_determinize_cmd="lattice-determinize-phone-pruned-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --beam=$lattice_beam $dir/final.mdl ark:- ark:-" else lattice_determinize_cmd="cat" fi diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 2888f77ed59..bd1cdde851d 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -380,7 +380,7 @@ if [ $stage -le 5 ]; then #concatenate egs.JOB.scp in single egs.scp rm $dir/egs.scp 2> /dev/null || true for j in $(seq $num_archives_intermediate); do - for y in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do cat $dir/egs.$j.$y.scp || exit 1; done done > $dir/egs.scp || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index cdf55ea81d3..1def1548215 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -24,6 +24,11 @@ block_size=256 # This is the number of consecutive egs that we take fro # access. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. +lang2num_copies= # comma-separated list of number of copies per + # input language + # This is another way to scale the effect of + # a langauge especially when the language has + # relatively very little data. stage=0 echo "$0 $@" # Print the command line for logging @@ -63,6 +68,15 @@ if [ ${#args[@]} != $[$num_langs+1] ]; then exit 1; fi +num_copies_per_lang= +if [ ! -z "$lang2num_copies" ]; then + IFS=, read -r -a num_copies_per_lang <<< $lang2num_copies + if [ ${#num_copies_per_lang[@]} -ne $num_langs ]; then + echo "$0: --lang2num-copies must be an array of num-langs=$num_langs integers" + exit 1 + fi +fi + required="egs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" train_scp_list= train_diagnostic_scp_list= @@ -87,12 +101,45 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." 
&& exit 1; fi done + + if [ -z "$lang2num_copies" ] || [ ${num_copies_per_lang[$lang]} -eq 1 ]; then + train_scp_list="$train_scp_list ${multi_egs_dir[$lang]}/egs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${multi_egs_dir[$lang]}/train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${multi_egs_dir[$lang]}/valid_diagnostic.scp" + combine_scp_list="$combine_scp_list ${multi_egs_dir[$lang]}/combine.scp" + else + rm -f $megs_dir/lang${lang}_egs.scp $megs_dir/lang${lang}_train_diagnostic.scp \ + $megs_dir/lang${lang}_valid_diagnostic.scp $megs_dir/lang${lang}_combine.scp + + if [ $(perl -e "{print int(${num_copies_per_lang[$lang]})}") != ${num_copies_per_lang[$lang]} ]; then + echo "$0: Expected --lang2num-copies to have only integers; " + echo "$0: got ${num_copies_per_lang[$lang]} for language $lang" + exit 1 + fi + + for i in `seq ${num_copies_per_lang[$lang]}`; do + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/egs.scp >> \ + $megs_dir/lang${lang}_egs.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/train_diagnostic.scp >> \ + $megs_dir/lang${lang}_train_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/valid_diagnostic.scp >> \ + $megs_dir/lang${lang}_valid_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/combine.scp >> \ + $megs_dir/lang${lang}_combine.scp + done + + if [ $(head -n1 $megs_dir/lang${lang}_egs.scp | wc -w) -ne 2 ]; then + echo "$0: Incorrect format in $megs_dir/lang${lang}_egs.scp; something went wrong!" + exit 1 + fi + + train_scp_list="$train_scp_list $megs_dir/lang${lang}_egs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list $megs_dir/lang${lang}_train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list $megs_dir/lang${lang}_valid_diagnostic.scp" + combine_scp_list="$combine_scp_list $megs_dir/lang${lang}_combine.scp" + fi num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) tot_num_archives=$[tot_num_archives+num_archives] - train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" - train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" - valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" - combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" # check parameter dimension to be the same in all egs dirs for f in $check_params; do diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 93cbc940c33..bb670d77135 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -163,7 +163,8 @@ def latex_compliant_name(name_string): def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', file_basename='accuracy', comparison_dir=None, - start_iter=1, latex_report=None, output_name='output'): + start_iter=1, latex_report=None, output_name='output', + get_smbr_objf=False): assert start_iter >= 1 @@ -176,7 +177,9 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', index = 0 for dir in dirs: [report, times, data] = log_parse.generate_acc_logprob_report(dir, key, - output_name) + output_name, + get_smbr_objf=get_smbr_objf) + if index == 0: # this is the main experiment directory with open("{0}/{1}.log".format(output_dir, @@ -187,9 +190,14 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', color_val = g_plot_colors[index] data = np.array(data) if data.shape[0] == 
0: - logger.warning("Couldn't find any rows for the" - "accuracy/log-probability plot, not generating it") - return + logger.warning("Couldn't find any data for the" + "%s plot of output '%s' " + "for %s, " + "not generating it", + "smbr" if get_smbr_objf else key, + output_name, dir) + + continue data = data[data[:, 0] >= start_iter, :] plot_handle, = plt.plot(data[:, 0], data[:, 1], color=color_val, linestyle="--", @@ -775,6 +783,18 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None, key='log-probability', file_basename='log_probability', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) + elif objective_type == "chain-smbr": + generate_acc_logprob_plots( + exp_dir, output_dir, g_plot, + key='log-probability', file_basename='log_probability', + comparison_dir=comparison_dir, start_iter=start_iter, + latex_report=latex_report, output_name=output_name) + generate_acc_logprob_plots( + exp_dir, output_dir, g_plot, + key='log-probability', file_basename='smbr', + comparison_dir=comparison_dir, start_iter=start_iter, + latex_report=latex_report, output_name=output_name, + get_smbr_objf=True) elif objective_type == "rnnlm_objective": logger.info("Generating RNNLM objective plots") generate_acc_logprob_plots( diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index 028a22d6bc2..213df2d76e9 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -54,6 +54,7 @@ cleanup=true keep_model_iters=100 remove_egs=false src_model= # will default to $degs_dir/final.mdl +adjust_priors=true # Set it to false for 'chain' models num_jobs_compute_prior=10 @@ -330,18 +331,19 @@ while [ $x -lt $num_iters ]; do e=${iter_to_epoch[$x]} ln -sf $x.mdl $dir/epoch$e.mdl - ( - rm $dir/.error 2> /dev/null - - steps/nnet3/adjust_priors.sh --egs-type degs \ - --num-jobs-compute-prior $num_jobs_compute_prior \ - --cmd "$cmd" --use-gpu false \ - --minibatch-size $minibatch_size \ - --use-raw-nnet false --iter epoch$e $dir $degs_dir \ - || { touch $dir/.error; echo "Error in adjusting priors. See errors above."; exit 1; } - ) & + if $adjust_priors; then + ( + rm $dir/.error 2> /dev/null + + steps/nnet3/adjust_priors.sh --egs-type degs \ + --num-jobs-compute-prior $num_jobs_compute_prior \ + --cmd "$cmd" --use-gpu false \ + --minibatch-size $minibatch_size \ + --use-raw-nnet false --iter epoch$e $dir $degs_dir \ + || { touch $dir/.error; echo "Error in adjusting priors. See errors above."; exit 1; } + ) & + fi fi - done rm $dir/final.mdl 2>/dev/null diff --git a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh index 9988c941441..6651a744e4d 100755 --- a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh +++ b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh @@ -16,6 +16,7 @@ word_ins_penalty=0.0,0.5,1.0 min_lmwt=7 max_lmwt=17 iter=final +scoring_affix=_kaldi #end configuration section. 
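Returning to the --lang2num-copies handling in multilingual/combine_egs.sh above: the duplication happens purely at the scp level by rewriting keys, so no features are copied on disk. A hypothetical example (the utterance id and ark offset are invented) with three copies requested for one language:

  # one input line of that language's egs.scp:
  #   utt_0001 ark:/some/path/egs.1.ark:1234
  for i in 1 2 3; do
    awk -v i=$i '{print $1"-"i" "$2}' egs.scp
  done
  # for that single line this emits:
  #   utt_0001-1 ark:/some/path/egs.1.ark:1234
  #   utt_0001-2 ark:/some/path/egs.1.ark:1234
  #   utt_0001-3 ark:/some/path/egs.1.ark:1234

Only the key changes; every copy points at the same example in the ark, so the language is simply sampled proportionally more often per epoch.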
echo "$0 $@" # Print the command line for logging @@ -59,15 +60,14 @@ else fi -mkdir -p $dir/scoring_kaldi -cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; -if [ $stage -le 0 ]; then - - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - mkdir -p $dir/scoring_kaldi/penalty_$wip/log +mkdir -p $dir/scoring${scoring_affix} +cat $data/text | $ref_filtering_cmd > $dir/scoring${scoring_affix}/test_filt.txt || exit 1; +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring${scoring_affix}/penalty_$wip/log + if [ $stage -le 0 ]; then if $decode_mbr ; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/best_path.LMWT.log \ acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ @@ -75,37 +75,38 @@ if [ $stage -le 0 ]; then lattice-mbr-decode --word-symbol-table=$symtab \ ark:- ark,t:- \| \ utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + $hyp_filtering_cmd '>' $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt || exit 1; else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/best_path.LMWT.log \ lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + $hyp_filtering_cmd '>' $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt || exit 1; fi + fi - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ - cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring_kaldi/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; - - done -fi + if [ $stage -le 1 ]; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring${scoring_affix}/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + fi +done -if [ $stage -le 1 ]; then +if [ $stage -le 2 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do for lmwt in $(seq $min_lmwt $max_lmwt); do # adding /dev/null to the command list below forces grep to output the filename grep WER $dir/wer_${lmwt}_${wip} /dev/null done - done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + done | utils/best_wer.sh >& $dir/scoring${scoring_affix}/best_wer || exit 1 - best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wer_file=$(awk '{print $NF}' $dir/scoring${scoring_affix}/best_wer) best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') @@ -115,25 +116,25 @@ if [ $stage -le 1 ]; then fi if $stats; then - mkdir -p $dir/scoring_kaldi/wer_details - echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight - echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty - - $cmd $dir/scoring_kaldi/log/stats1.log \ - cat 
$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ - align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ - utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ - utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; - - $cmd $dir/scoring_kaldi/log/stats2.log \ - cat $dir/scoring_kaldi/wer_details/per_utt \| \ + mkdir -p $dir/scoring${scoring_affix}/wer_details + echo $best_lmwt > $dir/scoring${scoring_affix}/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring${scoring_affix}/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring${scoring_affix}/log/stats1.log \ + cat $dir/scoring${scoring_affix}/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring${scoring_affix}/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring${scoring_affix}/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring${scoring_affix}/wer_details/per_spk || exit 1; + + $cmd $dir/scoring${scoring_affix}/log/stats2.log \ + cat $dir/scoring${scoring_affix}/wer_details/per_utt \| \ utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ - sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring${scoring_affix}/wer_details/ops || exit 1; - $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + $cmd $dir/scoring${scoring_affix}/log/wer_bootci.log \ compute-wer-bootci --mode=present \ - ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ - '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + ark:$dir/scoring${scoring_affix}/test_filt.txt ark:$dir/scoring${scoring_affix}/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring${scoring_affix}/wer_details/wer_bootci || exit 1; fi fi diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index 60e3df20df2..bfb74cb475e 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2016-17 Vimal Manohar # 2017 Nagendra Kumar Goel @@ -99,14 +99,14 @@ data_id=`basename $data_dir` sad_dir=${dir}/${sad_name}${affix}_${data_id}_whole${feat_affix} seg_dir=${dir}/${segmentation_name}${affix}_${data_id}_whole${feat_affix} -test_data_dir=data/${data_id}${feat_affix}_hires - if $convert_data_dir_to_whole; then + test_data_dir=data/${data_id}_whole${feat_affix}_hires if [ $stage -le 0 ]; then rm -r ${test_data_dir} || true utils/data/convert_data_dir_to_whole.sh $src_data_dir ${test_data_dir} fi else + test_data_dir=data/${data_id}${feat_affix}_hires if [ $stage -le 0 ]; then rm -r ${test_data_dir} || true utils/copy_data_dir.sh $src_data_dir $test_data_dir @@ -170,7 +170,8 @@ fi ## Prepare FST we search to make speech/silence decisions.
############################################################################### -frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 graph_dir=${dir}/graph_${output_name} if [ $stage -le 5 ]; then diff --git a/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py b/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py old mode 100644 new mode 100755 index de263c6923f..48d081058c4 --- a/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py +++ b/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py @@ -14,7 +14,7 @@ # limitations under the License. # ============================================================================== -# this script trains a vanilla RNNLM with TensorFlow. +# this script trains a vanilla RNNLM with TensorFlow. # to call the script, do # python steps/tfrnnlm/vanilla_rnnlm.py --data-path=$datadir \ # --save-path=$savepath --vocab-path=$rnn.wordlist [--hidden-size=$size] @@ -39,6 +39,7 @@ logging = tf.logging flags.DEFINE_integer("hidden-size", 200, "hidden dim of RNN") +flags.DEFINE_integer("max-max-epoch", 20, "maximum number of epochs") flags.DEFINE_string("data-path", None, "Where the training/test data is stored.") @@ -120,7 +121,7 @@ def attn_cell(): test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in") state_placeholder = tf.placeholder(tf.float32, [config.num_layers, 1, size], name="test_state_in") - # unpacking the input state context + # unpacking the input state context l = tf.unstack(state_placeholder, axis=0) test_input_state = tuple( [l[idx] for idx in range(config.num_layers)] @@ -281,6 +282,7 @@ def main(_): config = get_config() config.hidden_size = FLAGS.hidden_size + config.max_max_epoch = FLAGS.max_max_epoch config.vocab_size = len(word_map) eval_config = get_config() eval_config.batch_size = 1 diff --git a/egs/wsj/s5/utils/data/get_reco2dur.sh b/egs/wsj/s5/utils/data/get_reco2dur.sh index 943e739c53c..86225e07ede 100755 --- a/egs/wsj/s5/utils/data/get_reco2dur.sh +++ b/egs/wsj/s5/utils/data/get_reco2dur.sh @@ -88,7 +88,7 @@ elif [ -f $data/wav.scp ]; then fi read_entire_file=false - if grep -q 'sox.*speed' $data/wav.scp; then + if [ $(utils/data/internal/should_read_entire_wavefile.pl $data/wav.scp) == "true" ]; then read_entire_file=true echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." echo "... It is much faster if you call get_reco2dur.sh *before* doing the speed perturbation via e.g. 
perturb_data_dir_speed.sh or " @@ -115,8 +115,7 @@ elif [ -f $data/wav.scp ]; then $cmd JOB=1:$nj $data/log/get_reco_durations.JOB.log \ wav-to-duration --read-entire-file=$read_entire_file \ - scp:$temp_data_dir/JOB/wav.scp ark,t:$temp_data_dir/JOB/reco2dur || \ - { echo "$0: there was a problem getting the durations"; exit 1; } # This could + scp,p:$temp_data_dir/JOB/wav.scp ark,t:$temp_data_dir/JOB/reco2dur || exit 1 for n in `seq $nj`; do cat $temp_data_dir/$n/reco2dur diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh index 995136a5575..ece7ea968c7 100755 --- a/egs/wsj/s5/utils/data/get_utt2dur.sh +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -39,7 +39,31 @@ fi if [ -s $data/segments ]; then echo "$0: working out $data/utt2dur from $data/segments" - awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur + cat $data/segments | awk '{if ($4 != -1) { len=$4-$3; print $1, len;} else { print $1, "LENGTH_NOT_FOUND"; } }' > $data/utt2dur + + if [ $(grep LENGTH_NOT_FOUND $data/utt2dur | wc -l) -ne 0 ]; then + utils/data/get_reco2dur.sh --cmd "$cmd" $data + + cat $data/segments | python3 -c "import sys +reco2dur = {} +for line in open('$data/reco2dur').readlines(): + parts = line.strip().split() + reco2dur[parts[0]] = float(parts[1]) + +for line in sys.stdin.readlines(): + parts = line.strip().split() + st = float(parts[2]) + end = float(parts[3]) + if end == -1: + if parts[1] not in reco2dur: + print ('Could not find reco {} in $data/reco2dur'.format(parts[1]), + file=sys.stderr) + sys.exit(1) + len = reco2dur[parts[1]] - st + else: + len = end - st + print ('{} {}'.format(parts[0], len))" > $data/utt2dur || exit 1 + fi elif [ -f $data/wav.scp ]; then echo "$0: segments file does not exist so getting durations from wave files" @@ -75,14 +99,13 @@ elif [ -f $data/wav.scp ]; then fi read_entire_file=false - if grep -q 'sox.*speed' $data/wav.scp; then + if [ $(utils/data/internal/should_read_entire_wavefile.pl $data/wav.scp) == "true" ]; then read_entire_file=true echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." echo "... It is much faster if you call get_utt2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or " echo "... perturb_data_dir_speed_3way.sh." 
fi - num_utts=$(wc -l <$data/utt2spk) if [ $nj -gt $num_utts ]; then nj=$num_utts @@ -93,8 +116,7 @@ elif [ -f $data/wav.scp ]; then $cmd JOB=1:$nj $data/log/get_durations.JOB.log \ wav-to-duration --read-entire-file=$read_entire_file \ - scp:$sdata/JOB/wav.scp ark,t:$sdata/JOB/utt2dur || \ - { echo "$0: there was a problem getting the durations"; exit 1; } + scp,p:$sdata/JOB/wav.scp ark,t:$sdata/JOB/utt2dur || exit 1 for n in `seq $nj`; do cat $sdata/$n/utt2dur diff --git a/egs/wsj/s5/utils/data/internal/should_read_entire_wavefile.pl b/egs/wsj/s5/utils/data/internal/should_read_entire_wavefile.pl new file mode 100755 index 00000000000..04d2dd8b619 --- /dev/null +++ b/egs/wsj/s5/utils/data/internal/should_read_entire_wavefile.pl @@ -0,0 +1,10 @@ +#!/bin/perl + +while (<>) { + if (m/sox.*speed/ || m/sox.*trim/) { + print "true"; + exit(0); + } +} + +print "false"; diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index 1becfc45be3..3d06436e0a3 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -19,13 +19,15 @@ tscale=1.0 loopscale=0.1 remove_oov=false +unk_prob_scale=1.0 -for x in `seq 4`; do +for x in `seq 5`; do [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \ echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored." [ "$1" == "--remove-oov" ] && remove_oov=true && shift; [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2; [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2; + [ "$1" == "--unk-prob-scale" ] && unk_prob_scale=$2 && shift 2; done if [ $# != 3 ]; then @@ -79,12 +81,26 @@ P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; mkdir -p $lang/tmp + +G_fst=$lang/G.fst +if [ $unk_prob_scale != 1.0 ]; then + oov_symbol=`cat $lang/oov.int` + fstprint $lang/G.fst | \ + awk -v oov_symbol=$oov_symbol -v unk_scale=$unk_prob_scale '{ + if ($4 == oov_symbol) { + $5 = $5 - log(unk_scale); + } + print $0; + }' | fstcompile > $graph_dir/G_tmp.fst + G_fst=$graph_dir/G_tmp.fst +fi + trap "rm -f $lang/tmp/LG.fst.$$" EXIT HUP INT PIPE TERM # Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in # place of -o -if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ +if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $G_fst || \ $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then - fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ + fsttablecompose $lang/L_disambig.fst $G_fst | fstdeterminizestar --use-log=true | \ fstminimizeencoded | fstpushspecial | \ fstarcsort --sort_type=ilabel > $lang/tmp/LG.fst.$$ || exit 1; mv $lang/tmp/LG.fst.$$ $lang/tmp/LG.fst diff --git a/src/bin/decode-faster.cc b/src/bin/decode-faster.cc index cbcdb771d56..30c4988dcee 100644 --- a/src/bin/decode-faster.cc +++ b/src/bin/decode-faster.cc @@ -39,7 +39,11 @@ int main(int argc, char *argv[]) { const char *usage = "Decode, reading log-likelihoods (of transition-ids or whatever symbol is on the graph)\n" - "as matrices. Note: you'll usually want decode-faster-mapped rather than this program.\n" + "as matrices. " + "The matrixes are 0-indexed, while the symbol on the graph is 1-indexed. 
So " + "the column i of matrix corresponds to likelihood of symbol i+1 in the graph.\n" + "Note: you'll usually want decode-faster-mapped rather than this program for " + "decoding acoustic models.\n" "\n" "Usage: decode-faster [options] []\n"; ParseOptions po(usage); diff --git a/src/chain/Makefile b/src/chain/Makefile index 2a735c2ca2d..c57a9010bae 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -5,13 +5,14 @@ include ../kaldi.mk LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) -TESTFILES = chain-supervision-test language-model-test +TESTFILES = chain-supervision-test language-model-test chain-supervision-splitter-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ - chain-generic-numerator.o + chain-generic-numerator.o \ + chain-denominator-smbr.o chain-supervision-splitter.o ifeq ($(CUDA), true) - OBJFILES += chain-kernels.o + OBJFILES += chain-kernels.o chain-smbr-kernels.o endif LIBNAME = kaldi-chain diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc new file mode 100644 index 00000000000..c18154b1d9b --- /dev/null +++ b/src/chain/chain-denominator-smbr.cc @@ -0,0 +1,673 @@ +// chain/chain-denominator-smbr.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "chain/chain-denominator-smbr.h" +#include "chain/chain-kernels-ansi.h" + +namespace kaldi { +namespace chain { + +DenominatorSmbrComputation::DenominatorSmbrComputation( + const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output, + const CuMatrixBase &numerator_posteriors): + opts_(opts), + den_graph_(den_graph), + num_sequences_(num_sequences), + frames_per_sequence_(nnet_output.NumRows() / num_sequences_), + exp_nnet_output_transposed_(nnet_output, kTrans), + numerator_posteriors_transposed_(numerator_posteriors, kTrans), + nnet_output_acc_deriv_transposed_( + exp_nnet_output_transposed_.NumRows(), + std::min(exp_nnet_output_transposed_.NumCols(), + static_cast(kMaxDerivTimeSteps) * + num_sequences_)), + nnet_output_log_prob_deriv_transposed_( + exp_nnet_output_transposed_.NumRows(), + std::min(exp_nnet_output_transposed_.NumCols(), + static_cast(kMaxDerivTimeSteps) * + num_sequences_)), + alpha_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + alpha_smbr_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + beta_smbr_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + tot_prob_(num_sequences_, kUndefined), + tot_smbr_(num_sequences_, kUndefined), + tot_log_prob_(num_sequences_, kUndefined), + log_correction_term_(num_sequences_, kUndefined), + ok_(true) { + KALDI_ASSERT(opts_.leaky_hmm_coefficient >= 0.0 && + opts_.leaky_hmm_coefficient < 1.0); + + KALDI_ASSERT(opts_.smbr_leaky_hmm_coefficient < 1.0); + + if (opts_.smbr_leaky_hmm_coefficient < 0.0) + leaky_hmm_coefficient_ = opts_.leaky_hmm_coefficient; + else + leaky_hmm_coefficient_ = opts_.smbr_leaky_hmm_coefficient; + + // make sure the alpha sums and beta sums are zeroed. + alpha_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + alpha_smbr_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_smbr_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + + KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); + exp_nnet_output_transposed_.ApplyExp(); +} + + +void DenominatorSmbrComputation::AlphaSmbrFirstFrame() { + // dim == num_hmm_states_ * num_sequences_. + BaseFloat *first_frame_alpha = alpha_.RowData(0); + // create a 'fake matrix' - view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(first_frame_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // TODO (possible): It would be more efficient here if we implemented a + // CopyColsFromVec function in class CuMatrix. + alpha_mat.SetZero(); + alpha_mat.AddVecToCols(1.0, den_graph_.InitialProbs(), 0.0); + + BaseFloat *first_frame_alpha_smbr = alpha_smbr_.RowData(0); + // create a 'fake matrix' - view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubVector alpha_smbr_vec(first_frame_alpha_smbr, + den_graph_.NumStates() + * num_sequences_); + alpha_smbr_vec.SetZero(); +} + + +// the alpha smbr computation for some 0 < t <= num_time_steps_. 
+void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { + KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); + BaseFloat *this_alpha = alpha_.RowData(t); + BaseFloat *this_alpha_smbr = alpha_smbr_.RowData(t); + const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); + const BaseFloat *prev_alpha_smbr = alpha_smbr_.RowData(t - 1); + const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), + num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + + // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. + CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + (t-1) * num_sequences_, num_sequences_); + const BaseFloat *prob_data = probs.Data(); + + // 'numerator_post' is the matrix of numerator posteriors for frame t - 1. + CuSubMatrix numerator_post( + numerator_posteriors_transposed_, 0, num_pdfs, + (t-1) * num_sequences_, num_sequences_); + const BaseFloat *post_data = numerator_post.Data(); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_smbr_hmm_forward(dimGrid, dimBlock, + backward_transitions, transitions, + num_sequences, den_graph_.NumStates(), + prob_data, probs.Stride(), + post_data, numerator_post.Stride(), + prev_alpha_dash, prev_alpha_smbr, + this_alpha, this_alpha_smbr); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. We can compute the alphas for the remaining HMM states by + // moving some of the array pointers and making the call again. + backward_transitions += dimGrid.y; + this_alpha += dimGrid.y * num_sequences; + this_alpha_smbr += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 prob_stride = probs.Stride(), + post_stride = numerator_post.Stride(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + double this_tot_alpha = 0.0; + double this_tot_alpha_smbr = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + prev_hmm_state = trans_iter->hmm_state; + BaseFloat prob = prob_data[pdf_id * prob_stride + s], + post = post_data[pdf_id * post_stride + s], + this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s], + this_prev_alpha_smbr = prev_alpha_smbr[prev_hmm_state * num_sequences + s]; + this_tot_alpha += this_prev_alpha * transition_prob * prob; + KALDI_ASSERT(post > -1e-20); + this_tot_alpha_smbr += + (this_prev_alpha_smbr + post) + * this_prev_alpha * transition_prob * prob; + } + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. 
We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; + KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); + KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + if (this_tot_alpha > 0.0) + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr / this_tot_alpha; + else + this_alpha_smbr[h * num_sequences + s] = 0.0; + } + } + } +} + +void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { + BaseFloat *this_alpha = alpha_.RowData(t); + BaseFloat *this_alpha_smbr = alpha_smbr_.RowData(t); + + // create a 'fake matrix' for the regular alphas- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(this_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + CuSubMatrix alpha_smbr_mat(this_alpha_smbr, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + alpha_smbr_mat.MulElements(alpha_mat); + + // Compute the sum of alpha over all states i for the current time. + // This is done for each sequence and stored in the last 'num_sequences_' + // columns. + CuSubVector alpha_sum_vec(this_alpha + + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + CuSubVector alpha_smbr_sum_vec( + this_alpha_smbr + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_smbr_sum_vec.AddRowSumMat(1.0, alpha_smbr_mat, 0.0); + + KALDI_ASSERT(alpha_sum_vec.Min() > 0); + + alpha_smbr_mat.AddVecVec(leaky_hmm_coefficient_, + den_graph_.InitialProbs(), + alpha_smbr_sum_vec); + alpha_mat.AddVecVec(leaky_hmm_coefficient_, + den_graph_.InitialProbs(), + alpha_sum_vec); + // it's now alpha-dash. + + alpha_smbr_mat.DivElements(alpha_mat); +} + +// compute beta from beta-dash. +void DenominatorSmbrComputation::BetaSmbr(int32 t) { + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + BaseFloat *this_beta_smbr_dash = beta_smbr_.RowData(t % 2); + // create a 'fake matrix' for the regular beta-dash (which is + // the counterpart of alpha-dash)- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix beta_dash_mat(this_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + CuSubMatrix beta_smbr_dash_mat(this_beta_smbr_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + beta_smbr_dash_mat.MulElements(beta_dash_mat); + + // making the t index implicit, the beta-dash-sum for each sequence is the sum + // over all states i of beta_i * leaky_hmm_coefficient_ * initial_prob_i. + CuSubVector beta_dash_sum_vec( + this_beta_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_dash_sum_vec.AddMatVec(leaky_hmm_coefficient_, beta_dash_mat, + kTrans, den_graph_.InitialProbs(), 0.0); + CuSubVector beta_smbr_dash_sum_vec( + this_beta_smbr_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_smbr_dash_sum_vec.AddMatVec(leaky_hmm_coefficient_, + beta_smbr_dash_mat, kTrans, + den_graph_.InitialProbs(), 0.0); + + // we are computing beta in place. 
After the following, beta-dash-mat + // will contain the actual beta (i.e. the counterpart of alpha), + // not the beta-dash. + beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); + + beta_smbr_dash_mat.AddVecToRows(1.0, beta_smbr_dash_sum_vec); + beta_smbr_dash_mat.DivElements(beta_dash_mat); +} + +BaseFloat DenominatorSmbrComputation::ForwardSmbr(BaseFloat *aux_objf) { + AlphaSmbrFirstFrame(); + AlphaSmbrDash(0); + for (int32 t = 1; t <= frames_per_sequence_; t++) { + AlphaSmbrGeneralFrame(t); + AlphaSmbrDash(t); + } + return ComputeTotObjf(aux_objf); +} + +BaseFloat DenominatorSmbrComputation::ComputeTotObjf(BaseFloat *aux_objf) { + tot_prob_.Resize(num_sequences_); + tot_smbr_.Resize(num_sequences_); + // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. + CuSubMatrix last_alpha_dash( + alpha_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + CuMatrix last_alpha_smbr(CuSubMatrix ( + alpha_smbr_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_)); + // TODO: Make this vector multiplication + + // Sum over all the HMM states for each sequence. + tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); + // we should probably add an ApplyLog() function that takes a vector argument. + tot_log_prob_ = tot_prob_; + tot_log_prob_.ApplyLog(); + BaseFloat tot_log_prob = tot_log_prob_.Sum(); + + // We now have to add something for the arbitrary scaling factor. [note: the + // purpose of the arbitrary scaling factors was to keep things in a good + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. + // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. + int32 num_hmm_states = den_graph_.NumStates(); + CuSubMatrix inv_arbitrary_scales( + alpha_, 0, frames_per_sequence_, + num_sequences_ * num_hmm_states, num_sequences_); + CuMatrix log_inv_arbitrary_scales( + inv_arbitrary_scales); + log_inv_arbitrary_scales.ApplyLog(); + BaseFloat log_inv_arbitrary_scales_product = + log_inv_arbitrary_scales.Sum(); + + BaseFloat prob_sum = tot_prob_.Sum(); + KALDI_ASSERT(prob_sum == prob_sum); + + // Take weighted-average of the SMBR quantitites over all the + // HMM states for each sequence. + last_alpha_smbr.MulElements(last_alpha_dash); + tot_smbr_.AddRowSumMat(1.0, last_alpha_smbr, 0.0); + tot_smbr_.DivElements(tot_prob_); + + if (aux_objf) + *aux_objf = -opts_.mmi_factor * ( + tot_log_prob + log_inv_arbitrary_scales_product); + return opts_.smbr_factor * tot_smbr_.Sum(); +} + + + +bool DenominatorSmbrComputation::BackwardSmbr( + BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv) { + BetaSmbrDashLastFrame(); + BetaSmbr(frames_per_sequence_); + for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { + BetaSmbrDashGeneralFrame(t); + if (GetVerboseLevel() >= 1 || t == 0) + BetaSmbrGeneralFrameDebug(t); + BetaSmbr(t); + if (t % kMaxDerivTimeSteps == 0) { + // commit the derivative stored in exp_nnet_output_transposed_ by adding + // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. 
+ int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), + frames_per_sequence_ - t), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + + CuSubMatrix output_deriv_part( + *nnet_output_deriv, + t * num_sequences_, chunk_frames * num_sequences_, + 0, num_pdfs); + + // The following is needed so that the matrix will be of the same + // dimension as output_deriv_part. + CuSubMatrix transposed_log_prob_deriv_part( + nnet_output_log_prob_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + output_deriv_part.AddMat(-deriv_weight * opts_.mmi_factor, + transposed_log_prob_deriv_part, kTrans); + + CuSubMatrix transposed_acc_deriv_part( + nnet_output_acc_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + output_deriv_part.AddMat(deriv_weight * opts_.smbr_factor, + transposed_acc_deriv_part, kTrans); + + if (GetVerboseLevel() >= 2) { + CuVector acc_deriv_sum(num_pdfs); + acc_deriv_sum.AddColSumMat(1.0, transposed_acc_deriv_part, 0.0); + CuVector log_prob_deriv_sum(num_pdfs); + log_prob_deriv_sum.AddColSumMat(1.0, transposed_log_prob_deriv_part, 0.0); + + CuSubMatrix transposed_num_post( + numerator_posteriors_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + + acc_deriv_sum.Write(KALDI_LOG, false); + log_prob_deriv_sum.Write(KALDI_LOG, false); + } + + transposed_log_prob_deriv_part.MulColsGroupVec(tot_smbr_); + output_deriv_part.AddMat(-deriv_weight * opts_.smbr_factor, + transposed_log_prob_deriv_part, kTrans); + + if (t != 0) { + transposed_acc_deriv_part.SetZero(); + transposed_log_prob_deriv_part.SetZero(); + } + } + } + return ok_; +} + +void DenominatorSmbrComputation::BetaSmbrDashLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. + + int32 t = frames_per_sequence_; + BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); + + // create a 'fake matrix' - view this row as a matrix. + CuSubMatrix beta_dash_mat(last_frame_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + CuVector inv_tot_prob(tot_prob_); + inv_tot_prob.InvertElements(); + // the beta values at the end of the file only vary with the sequence-index, + // not with the HMM-index. We treat all states as having a final-prob of one. + beta_dash_mat.CopyRowsFromVec(inv_tot_prob); + + BaseFloat *last_frame_beta_smbr_dash = beta_smbr_.RowData(t % 2); + + CuSubVector beta_smbr_dash_vec(last_frame_beta_smbr_dash, + den_graph_.NumStates() + * num_sequences_); + beta_smbr_dash_vec.SetZero(); +} + +void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { + KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); + // t_wrapped gives us the time-index we use when indexing + // nnet_output_deriv_transposed_; to save memory we limit the size of the + // matrix, storing only chunks of frames at a time, and we add it to the + // non-transposed output whenever we finish a chunk. 
+ int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); + const BaseFloat *this_alpha_dash = alpha_.RowData(t), + *this_alpha_smbr = alpha_smbr_.RowData(t), + *next_beta = beta_.RowData((t + 1) % 2), + *next_beta_smbr = beta_smbr_.RowData((t + 1) % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2), + *this_beta_smbr = beta_smbr_.RowData(t % 2); + const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + // 'probs' is the matrix of pseudo-likelihoods for frame t. + // 'numerator_post' is the matrix of numerator posteriors for frame t. + CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + t * num_sequences_, num_sequences_), + numerator_post(numerator_posteriors_transposed_, 0, num_pdfs, + t * num_sequences_, num_sequences_), + acc_deriv(nnet_output_acc_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_), + log_prob_deriv(nnet_output_log_prob_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + + int32 num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_smbr_hmm_backward( + dimGrid, dimBlock, forward_transitions, transitions, + num_sequences, num_hmm_states, + probs.Data(), probs.Stride(), + numerator_post.Data(), numerator_post.Stride(), + tot_smbr_.Data(), + this_alpha_dash, this_alpha_smbr, + next_beta, next_beta_smbr, + this_beta_dash, this_beta_smbr, + acc_deriv.Data(), acc_deriv.Stride(), + log_prob_deriv.Data(), log_prob_deriv.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. We can compute the betas (and log-prob derivatives) for the + // remaining HMM states by moving some of the array pointers and making + // the call again. 
+ forward_transitions += dimGrid.y; + this_alpha_dash += dimGrid.y * num_sequences; + this_beta_dash += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 prob_stride = probs.Stride(), + post_stride = numerator_post.Stride(), + acc_deriv_stride = acc_deriv.Stride(), + log_prob_deriv_stride = log_prob_deriv.Stride(); + const BaseFloat *prob_data = probs.Data(); + const BaseFloat *post_data = numerator_post.Data(); + BaseFloat *acc_deriv_data = acc_deriv.Data(); + BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], + this_alpha_smbr_i = this_alpha_smbr[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha_dash[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0, tot_beta_smbr = 0.0; + BaseFloat occupation_factor = this_alpha_dash_prob / + inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + next_hmm_state = trans_iter->hmm_state; + BaseFloat next_beta_j = next_beta[next_hmm_state * num_sequences + s], + next_beta_smbr_j = next_beta_smbr[next_hmm_state * num_sequences + s]; + BaseFloat prob = prob_data[pdf_id * prob_stride + s], + post = post_data[pdf_id * post_stride + s], + variable_factor = transition_prob * next_beta_j * prob; + tot_beta_smbr += (next_beta_smbr_j + post) * variable_factor; + tot_variable_factor += variable_factor; + BaseFloat occupation_prob = occupation_factor * variable_factor; + BaseFloat this_acc_r = occupation_prob * + (this_alpha_smbr_i + post + next_beta_smbr_j); + acc_deriv_data[pdf_id * acc_deriv_stride + s] += + this_acc_r; + log_prob_deriv_data[pdf_id * log_prob_deriv_stride + s] += + occupation_prob; + } + this_beta_dash[h * num_sequences + s] = + tot_variable_factor / inv_arbitrary_scale; + if (tot_variable_factor > 0.0) + this_beta_smbr[h * num_sequences + s] = + tot_beta_smbr / tot_variable_factor; + else + this_beta_smbr[h * num_sequences + s] = 0.0; + } + } + } +} + +void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { + BaseFloat num_hmm_states = den_graph_.NumStates(), + alpha_beta_size = num_hmm_states * num_sequences_; + CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size), + this_alpha_smbr(alpha_smbr_.RowData(t), alpha_beta_size), + this_beta_smbr(beta_smbr_.RowData(t % 2), alpha_beta_size); + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash); + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + KALDI_WARN << "On time " << t << ", alpha-beta product " + << alpha_beta_product << " != " << num_sequences_ + << " alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + if (fabs(alpha_beta_product - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } else { + KALDI_VLOG(1) << "On time " << t << ", alpha-beta product = " + << 
alpha_beta_product + << ", alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + } + + // alpha_smbr_vec is a vector of size 'num_hmm_states' * 'num_sequences_' + CuVector alpha_beta_smbr_vec(this_beta_smbr); + alpha_beta_smbr_vec.AddVec(1.0, this_alpha_smbr, 1.0); + + CuVector alpha_beta_vec(this_alpha_dash); + alpha_beta_vec.MulElements(this_beta_dash); + + alpha_beta_smbr_vec.MulElements(alpha_beta_vec); + + BaseFloat alpha_beta_smbr_sum = alpha_beta_smbr_vec.Sum() + / alpha_beta_product * num_sequences_, + tot_smbr_sum = tot_smbr_.Sum(); + KALDI_ASSERT (alpha_beta_smbr_sum - alpha_beta_smbr_sum == 0.0); + if (!ApproxEqual(tot_smbr_sum, alpha_beta_smbr_sum, 0.01)) { + KALDI_WARN << "On time " << t << ", alpha-beta-smbr " + << alpha_beta_smbr_sum << " != " << tot_smbr_sum; + } else { + KALDI_VLOG(1) << "On time " << t << ", alpha-beta-smbr " + << alpha_beta_smbr_sum << " = tot-smbr-sum"; + } + + { + CuSubMatrix this_log_prob_deriv( + nnet_output_log_prob_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. + if (GetVerboseLevel() > 1 || !ApproxEqual( + this_log_prob_deriv_sum, num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " + << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + } + + { + BaseFloat tot_smbr = tot_smbr_.Sum(); + + CuSubMatrix this_acc_deriv( + nnet_output_acc_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat this_acc_deriv_sum = this_acc_deriv.Sum(); + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. + if (GetVerboseLevel() > 1 || !ApproxEqual( + this_acc_deriv_sum, tot_smbr, 0.01)) { + KALDI_WARN << "On time " << t << ", acc-deriv sum " + << this_acc_deriv_sum << " != " + << tot_smbr; + if (fabs(this_acc_deriv_sum - tot_smbr) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + } +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h new file mode 100644 index 00000000000..fce3114521d --- /dev/null +++ b/src/chain/chain-denominator-smbr.h @@ -0,0 +1,358 @@ +// chain/chain-denominator-smbr.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) +// 2016 Vimal Manohar + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
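Before the header below, it is worth stating what ForwardSmbr()/ComputeTotObjf() in chain-denominator-smbr.cc above return, in the notation of the extended comment that follows; this is a reading of the code, not text taken from the patch. With \alpha'(T,i) the leaky alphas of the last frame and \alpha_r(T,i) the accompanying expected-accuracy quantities of one sequence,

  \mathrm{SMBR}(\mathrm{seq}) = \frac{\sum_i \alpha_r(T,i)\,\alpha'(T,i)}{\sum_i \alpha'(T,i)},

the returned objective is smbr_factor \cdot \sum_{\mathrm{seq}} \mathrm{SMBR}(\mathrm{seq}), and the auxiliary value written to *aux_objf is

  -\mathrm{mmi\_factor} \cdot \sum_{\mathrm{seq}} \Bigl( \log \sum_i \alpha'(T,i) \;+\; \sum_{t=0}^{T-1} \log \mathrm{tot\text{-}\alpha}(t) \Bigr),

i.e. the renormalization-corrected log-probability of the denominator graph, negated and scaled.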
+ + +#ifndef KALDI_CHAIN_CHAIN_DENOMINATOR_SMBR_H_ +#define KALDI_CHAIN_CHAIN_DENOMINATOR_SMBR_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-array.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-training.h" + +namespace kaldi { +namespace chain { + + +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) and alpha_r(i, t) + for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + Also set alpha_r(0, i) = 0. + - For t = 1 ... T: + for i = 0 ... I-1: + alpha(t, i) = 0 + alpha_r(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p + alpha_r(t, i) += (alpha_r(t-1, j) + (ref_pdf == pdf ? 1.0 : 0.0)) * alpha(t-1, j) * x(t-1, n) * p + alpha_r(t, i) /= alpha(t, i) + + - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states + # to be 1.0. + - total-objf = \sum_i alpha(T, i) * alpha_r(T, i) / total-prob + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the logprob with respect to their corresponding + alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the log-likelihood + w.r.t. the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + for each i, beta_r(T, i) = 0 + - for t = T-1 ... 0: + for i = 0 ... 
I-1: + beta(t, i) = 0 + beta_r(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + beta_r(t, i) += (beta_r(t+1, j) + (ref_pdf == pdf ? 1.0 : 0)) * beta(t+1, j) * x(t, n) * p + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + gamma_r(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p * (alpha_r(t, i) + (ref_pdf == pdf ? 1.0 : 0) + beta_r(t+1, j) - tot_objf) + beta_r(t, i) /= beta(t, i) + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / tot-alpha(t). In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. + + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i), alpha_r(0, i) = 0 + - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + alpha_r(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + alpha_r(t, i) += (alpha_r(t-1, j) + (ref_pdf == pdf ? 
1.0 : 0.0)) * alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + alpha_r(t, i) /= alpha(t,i) + + - total-prob = \sum_i alpha'(T, i) + + - total-objf = \sum_i alpha'(T, i) * alpha_r(T, i) / total-prob + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - beta_r(T, i) = 0 + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + beta_r(t, i) += (beta_r(t+1, j) + (ref_pdf == pdf ? 1.0 : 0)) * beta(t+1, j) * x(t, n) / tot-alpha(t) * p + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma_r(t, n) += alpha'(t, i) * x(t, n) / tot-alpha(t) * beta(t+1, j) * p * (alpha_r(t, i) + (ref_pdf == pdf ? 1.0 : 0.0) + beta_r(t+1, j) - tot_objf) + beta_r(t, i) /= beta(t, i) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + +class DenominatorSmbrComputation { + public: + /* + Constructor. 'nnet_output' is the raw nnet output (which we'll treat as + pseudo-log-likelihoods). + + @param [in] opts The options. + @param [in] graph The HMM that we use for the denominator (like a decoding graph, + with pdf-ids on the transitions). + @param [in] num_sequences The number of separate time sequences (all of the same length) + that we are working with. Must divide nnet_output.NumRows(). + @param [in] nnet_output The output of the neural network for this minibatch. + The rows must be ordered as (first frame of all sequences) + (second frame of all sequences), etc. + */ + DenominatorSmbrComputation(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output, + const CuMatrixBase &num_posteriors); + + // Does the forward computation, and returns the total objective summed + // over all sequences. You will have to scale this by any supervision + // weighting factor, manually. + // aux_objf stores the value of the auxiliary MMI objective scaled by + // opts.mmi_factor + BaseFloat ForwardSmbr(BaseFloat *aux_objf); + + // this adds deriv_weight times (the derivative of the objective w.r.t. the + // nnet output), to 'nnet_output_deriv'. + // returns true if everything seemed OK, false if a failure was detected. + bool BackwardSmbr(BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv); + + private: + // Defining this constant as an enum is easier. it controls a memory/speed + // tradeoff, determining how many frames' worth of the transposed derivative + // we store at a time. It's not very critical; the only disadvantage from + // setting it small is that we have to invoke an AddMat kernel more times. 
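+  // For example (just restating the tradeoff, not a tuned value): with
+  // kMaxDerivTimeSteps = 8 the transposed derivative buffers below only need
+  // roughly num-pdfs rows by (8 * num-sequences) columns, at the cost of
+  // transferring the accumulated derivatives out of them roughly every 8
+  // frames.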
+ enum { kMaxDerivTimeSteps = 8 }; + + // sets up the alpha for frame t = 0. + void AlphaSmbrFirstFrame(); + // the alpha computation for some 0 < t <= num_time_steps_. + void AlphaSmbrGeneralFrame(int32 t); + // does the 'alpha-dash' computation for time t. this relates to + // 'leaky hmm'. + void AlphaSmbrDash(int32 t); + + // done after all the alphas, this function computes and returns the total + // smbr objective summed over all the sequences, and sets tot_prob_ (if we're + // doing correction) log_correction_term_. Note, this won't be scaled by + // 'deriv_scale' (which of course we haven't seen by the time this is called, + // from the ForwardSmbr() computation). + // aux_objf stores the value of the auxiliary MMI objective scaled by + // opts.mmi_factor + BaseFloat ComputeTotObjf(BaseFloat *aux_objf); + + void BetaSmbrDashLastFrame(); + // beta computation for 0 <= beta < num_time_steps_. + void BetaSmbrDashGeneralFrame(int32 t); + // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). + void BetaSmbr(int32 t); + + // some checking that we can do if debug mode is activated, or on frame zero. + // Sets ok_ to false if a bad problem is detected. + void BetaSmbrGeneralFrameDebug(int32 t); + + const ChainTrainingOptions &opts_; + const DenominatorGraph &den_graph_; + + // number of separate frame sequences + int32 num_sequences_; + // number of frames per sequence. nnet_output_.NumRows() equals + // num_sequences_ * frames_per_sequence. + int32 frames_per_sequence_; + + // The transpose of the exp() of the nnet output (the transpose is more + // convenient for memory locality, and the exp() avoids us having to + // exponentiate in the forward-backward). + // + // The row-index is the pdf-id; and the column index equals (frame_index * + // num_sequences + sequence_index). + CuMatrix exp_nnet_output_transposed_; + + // the numerator posterior probabilities + // The row-index is the pdf-id; and the column index equals (frame_index * + // num_sequences + sequence_index). + CuMatrix numerator_posteriors_transposed_; + + // the smbr derivs w.r.t. the nnet outputs (transposed) + CuMatrix nnet_output_acc_deriv_transposed_; + + // the log-prob derivs w.r.t. the nnet outputs (transposed) + CuMatrix nnet_output_log_prob_deriv_transposed_; + + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; + // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, + // are for the alpha-sums, which relates to leaky HMM. + CuMatrix alpha_; + + // the analogous alpha quantities for the SMBR objective + CuMatrix alpha_smbr_; + + // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 * + // (num-hmm-states * num-sequences + num_sequences). [the last + // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.] + // Note: for efficiency and to simplify the equations, these are actually the + // beta / tot_prob_. + CuMatrix beta_; + + // the analogous beta quantities for the SMBR objective + CuMatrix beta_smbr_; + + // the total probability for each sequence, excluding the product of + // correction terms. [the correction terms refer to the fact that we multiply + // on each frame by 1/alpha of hmm-state 0 of the previous frame.]. + // After the correction terms the total probability is fairly close to 1, + // which is why we can store it as non-log. 
+ CuVector tot_prob_; + + // the total smbr for each sequence. + CuVector tot_smbr_; + + // the log of tot_prob_. + CuVector tot_log_prob_; + + // the log of the total correction term for each sequence, which is the + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. The product of + // them must be included in the total likelihood. + CuVector log_correction_term_; + + bool ok_; + + BaseFloat leaky_hmm_coefficient_ = 1e-05; +}; + + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ + diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index e41e942e266..c6f6de7fdf1 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -425,6 +425,12 @@ void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { KALDI_WARN << "Excessive error detected, will abandon this minibatch"; ok_ = false; } + } else { + KALDI_VLOG(1) << "On time " << t << ", alpha-beta product = " + << alpha_beta_product + << ", alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + } // use higher tolerance, since we are using randomized pruning for the // log-prob derivatives. diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index 388c78ab2ee..d2040b6edc2 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -48,6 +48,41 @@ extern "C" { const BaseFloat *prev_alpha, BaseFloat *this_alpha); + void cuda_chain_smbr_hmm_backward(dim3 Gr, dim3 Bl, + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *num_post, + int32_cuda post_stride, + const BaseFloat *tot_smbr, + const BaseFloat *this_alpha, + const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, + const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, + BaseFloat *this_beta_smbr, + BaseFloat *acc_deriv, + int32_cuda acc_deriv_stride, + BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride); + + void cuda_chain_smbr_hmm_forward(dim3 Gr, dim3 Bl, + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *num_post, + int32_cuda post_stride, + const BaseFloat *prev_alpha, + const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, + BaseFloat *this_alpha_smbr); + } // extern "C" #endif // HAVE_CUDA diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index f093f21a5a5..8ed7bd54a98 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -169,7 +169,7 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, // range. This won't affect the posteriors, as it's just a constant factor // for each frame, but when computing the total likelihood we'll need to // compensate for it later on. 
- BaseFloat arbitrary_scale = + BaseFloat arbitrary_scale = 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -256,6 +256,7 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, } + void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, @@ -287,3 +288,4 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, this_beta, log_prob_deriv, log_prob_deriv_stride); } + diff --git a/src/chain/chain-numerator.cc b/src/chain/chain-numerator.cc index 139d28bdd77..973ceb352a6 100644 --- a/src/chain/chain-numerator.cc +++ b/src/chain/chain-numerator.cc @@ -148,6 +148,7 @@ BaseFloat NumeratorComputation::Forward() { void NumeratorComputation::Backward( + BaseFloat weight, CuMatrixBase *nnet_output_deriv) { const fst::StdVectorFst &fst = supervision_.fst; int32 num_states = fst.NumStates(); @@ -204,7 +205,8 @@ void NumeratorComputation::Backward( // copy this data to GPU. CuVector nnet_logprob_deriv_cuda; nnet_logprob_deriv_cuda.Swap(&nnet_logprob_derivs_); - nnet_output_deriv->AddElements(supervision_.weight, nnet_output_indexes_, + nnet_output_deriv->AddElements(supervision_.weight * weight, + nnet_output_indexes_, nnet_logprob_deriv_cuda.Data()); } diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 15cb31e0571..691d9d72085 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -78,7 +78,7 @@ class NumeratorComputation { // Does the backward computation and (efficiently) adds the derivative of the // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. - void Backward(CuMatrixBase *nnet_output_deriv); + void Backward(BaseFloat weight, CuMatrixBase *nnet_output_deriv); private: diff --git a/src/chain/chain-smbr-kernels.cu b/src/chain/chain-smbr-kernels.cu new file mode 100644 index 00000000000..b7fef9b6fb0 --- /dev/null +++ b/src/chain/chain-smbr-kernels.cu @@ -0,0 +1,366 @@ +// chain/chain-kernels.cu + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include "chain/chain-kernels-ansi.h" + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 +#error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ + configure with --use-cuda=no (this will disable the use of GPU). 
+#endif + + +#ifdef __CUDACC__ +#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 ) +// native implementation available +#else +#if __CUDA_ARCH__ >= 600 +#error using CAS implementation of double atomicAdd +#endif +__device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif +#endif + + +template +__device__ inline void atomic_add(Real* address, Real value) { + atomicAdd(address, value); +} + +template +__device__ inline void atomic_add_thresholded(Real* address, Real value) { + // This function uses a randomized algorithm to only do atomic adds for values + // with absolute value >= a threshold, + // and if it's below the threshold, randomly add the + // threshold itself with probability (value / threshold). This preserves + // expectations. + + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); + Real abs_value = abs(value); + if (abs_value >= threshold) { + atomic_add(address, value); + } else { + // The intention here is to do: + // with probability(value / threshold), do: + // atomic_add(address, threshold); + // We use the least significant bits of the value as a source of + // randomness. It would probably be more efficient to extract these + // random bits directly from the float, but I don't want to have to + // deal with endian-ness issues. + // + // below, x is a fixed-point representation of (value / threshold); it would + // be 16777216 == 2^24 if value == threshold and 0 if value == 0. We choose + // the power 24 because that's the number of binary digits in the mantissa + // in IEEE single precision floating point. + // Note: we parenthesize the expression like this so that the + // denominator can be precomputed as a constant expression. + int32_cuda x = abs_value / (threshold / (1 << 24)); + // in the line below, the expression (x >> 12) is a representation of (value / + // threshold) between 0 and 4096, with 4096 representing (value / threshold == + // 1), while (x & 4095) is treated as a pseudorandom number between 0 and 4095. + if ((x >> 12) > (x & 4095)) { + if (value >= 0) atomic_add(address, threshold); + else atomic_add(address, -threshold); + } + } +} + +// one iteration of the forward computation in the chain HMM with +// SMBR objective. +// The grid y determines which HMM-state we handle. [put this in the grid because +// HMM-states don't all take the same amount of time in the backwards direction, and it's +// better for scheduling to have them at the outer level.] +// The block x and grid x determine which sequence (0 ... num_sequences - 1) we handle; +// note that num_sequences == the number of elements in the minibatch, and we +// insist they all have the same number of time steps. +// note: 'probs' is indexed by sequence-index + (pdf-index * prob_stride). +// note: 'num_post' is indexed by sequence-index + (pdf-index * post_stride). 
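+// In terms of the variables used below, each thread handles one (sequence s,
+// HMM-state h) pair and computes (this just restates the recursion from the
+// extended comment in chain-denominator-smbr.h):
+//   this_tot_alpha      = sum over incoming arcs (j, p, n) of
+//                           prev_alpha[j] * p * probs[n]
+//   this_tot_alpha_smbr = sum over the same arcs of
+//                           (prev_alpha_smbr[j] + num_post[n]) * prev_alpha[j] * p * probs[n]
+// and then stores this_alpha[h] = this_tot_alpha * arbitrary_scale, and
+// this_alpha_smbr[h] = this_tot_alpha_smbr / this_tot_alpha (or 0 if
+// this_tot_alpha is 0), where arbitrary_scale is one over the previous
+// frame's alpha-sum.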
+__global__ +static void _cuda_chain_smbr_hmm_forward( + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { + // 'backward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transitions' array. This gives us the info for + // transitions *into* this state. 'probs' contains the exponentiated neural + // net outputs; it has dimension num-output-indexes by num_sequences and its + // stride is 'prob_stride'. 'prev_alpha' and 'this_alpha', which are + // extracted from a larger matrix, both have dimension num-history-states by + // num-sequences. 'prev_alpha_smbr' and 'this_alpha_smbr' are analogous + // for the partial SMBR values. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + double this_tot_alpha = 0.0, this_tot_alpha_smbr = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + // Note: regarding this loop unrolling, I tried the automatic unrolling using + // #pragma unroll 2 (after modifying the loop to have an integer index), but I + // did not see any performance improvement, it was slightly slower. So the + // compiler must be doing something different than what I'm doing here. + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + prev_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + this_prev_alpha_smbr0 = + prev_alpha_smbr[prev_hmm_state0 * num_sequences + s], + pseudo_loglike1 = probs[pdf_id1 * prob_stride + s], + num_post1 = num_post[pdf_id1 * post_stride + s], + this_prev_alpha1 = prev_alpha[prev_hmm_state1 * num_sequences + s], + this_prev_alpha_smbr1 = + prev_alpha_smbr[prev_hmm_state1 * num_sequences + s]; + + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + + this_prev_alpha1 * transition_prob1 * pseudo_loglike1; + this_tot_alpha_smbr += + (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 + * transition_prob0 * pseudo_loglike0 + + (this_prev_alpha_smbr1 + num_post1) * this_prev_alpha1 + * transition_prob1 * pseudo_loglike1; + } + if (trans_iter != trans_end) { + // mop up the odd transition. 
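+    // (i.e. if the number of incoming transitions is odd, handle the single
+    // transition left over by the two-way unrolled loop above.)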
+ BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + this_prev_alpha_smbr0 = + prev_alpha_smbr[prev_hmm_state0 * num_sequences + s]; + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; + this_tot_alpha_smbr += + (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 + * transition_prob0 * pseudo_loglike0; + } + + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + if (this_tot_alpha > 0.0) + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr / this_tot_alpha; + else + this_alpha_smbr[h * num_sequences + s] = 0.0; +} + + +__global__ +static void _cuda_chain_smbr_hmm_backward( + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *tot_smbr, + const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *acc_deriv, int32_cuda acc_deriv_stride, + BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { + // 'forward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transition_info' array. This is about the transitions + // *out of* this state. 'probs' contains the exponentiated neural net + // outputs; it has dimension num-output-indexes by num_sequences, and contains + // just the observation probabilities for this time index. Its stride is + // prob_stride. + // 'this_alpha', 'next_beta' and 'this_beta' all have dimension + // num-history-states by num-sequences. + // 'this_alpha_smbr', 'next_beta_smbr', and 'this_beta_smbr' are + // analogous quantities storing values for SMBR objective. + // The beta probs are normalized in such a way (by multiplying by 1/(total-data-prob)) + // that to get occupation counts we don't need to multiply by 1/total-data-prob. + // deriv_scale is a factor (e.g. -1.0 or -0.99) that we multiply these derivs by + // while accumulating them. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + // See where arbitrary_scale is defined in the forward computation above, for + // more explanation of inv_arbitrary_scale. 
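+  // Summary of what the unrolled loop below accumulates for each outgoing arc
+  // (j, p, n) (this just restates the code in the notation of
+  // chain-denominator-smbr.h):
+  //   variable_factor    = p * next_beta[j] * probs[n]
+  //   occupation_prob    = variable_factor * this_alpha[h] / inv_arbitrary_scale
+  //   log_prob_deriv[n] += occupation_prob
+  //   acc_deriv[n]      += occupation_prob * (this_alpha_smbr[h] + num_post[n] + next_beta_smbr[j])
+  // At the end, this_beta[h] is the sum of variable_factor over the arcs
+  // divided by inv_arbitrary_scale, and this_beta_smbr[h] is the
+  // variable_factor-weighted average of (next_beta_smbr[j] + num_post[n]).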
+ BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + this_alpha_smbr_i = this_alpha_smbr[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0, tot_beta_smbr = 0.0; + + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + next_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], + next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], + next_beta_j1 = next_beta[next_hmm_state1 * num_sequences + s], + next_beta_smbr_j1 = next_beta_smbr[next_hmm_state1 * num_sequences + s], + prob0 = probs[pdf_id0 * prob_stride + s], + prob1 = probs[pdf_id1 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s], + num_post1 = num_post[pdf_id1 * post_stride + s]; + + BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0, + variable_factor1 = transition_prob1 * next_beta_j1 * prob1; + tot_beta_smbr += (next_beta_smbr_j0 + num_post0) * variable_factor0 + + (next_beta_smbr_j1 + num_post1) * variable_factor1; + tot_variable_factor += variable_factor0 + variable_factor1; + BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; + BaseFloat this_acc_r0 = occupation_prob0 + * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0); + atomic_add(acc_deriv + (pdf_id0 * acc_deriv_stride + s), + this_acc_r0); + atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + occupation_prob0); + BaseFloat occupation_prob1 = variable_factor1 * occupation_factor; + BaseFloat this_acc_r1 = occupation_prob1 + * (this_alpha_smbr_i + num_post1 + next_beta_smbr_j1); + atomic_add(acc_deriv + (pdf_id1 * acc_deriv_stride + s), + this_acc_r1); + atomic_add(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), + occupation_prob1); + } + if (trans_iter != trans_end) { + // mop up the odd transition. 
+ BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], + next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], + prob0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s]; + BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0; + tot_beta_smbr += (next_beta_smbr_j0 + num_post0) * variable_factor0; + tot_variable_factor += variable_factor0; + BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; + BaseFloat this_acc_r0 = occupation_prob0 + * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0); + atomic_add(acc_deriv + (pdf_id0 * acc_deriv_stride + s), + this_acc_r0); + atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + occupation_prob0); + } + BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; + this_beta[h * num_sequences + s] = beta; + if (tot_variable_factor > 0.0) + this_beta_smbr[h * num_sequences + s] = + tot_beta_smbr / tot_variable_factor; + else + this_beta_smbr[h * num_sequences + s] = 0.0; +} + + +// Chain forward with SMBR objective +void cuda_chain_smbr_hmm_forward( + dim3 Gr, dim3 Bl, + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { + _cuda_chain_smbr_hmm_forward<<>>( + backward_transitions, transitions, + num_sequences, num_hmm_states, + probs, prob_stride, num_post, post_stride, + prev_alpha, prev_alpha_smbr, this_alpha, this_alpha_smbr); +} + +void cuda_chain_smbr_hmm_backward( + dim3 Gr, dim3 Bl, + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *tot_smbr, + const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *acc_deriv, + int32_cuda acc_deriv_stride, + BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride) { + _cuda_chain_smbr_hmm_backward<<>>( + forward_transitions, transitions, + num_sequences, num_hmm_states, + probs, prob_stride, num_post, post_stride, tot_smbr, + this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, + this_beta, this_beta_smbr, + acc_deriv, acc_deriv_stride, + log_prob_deriv, log_prob_deriv_stride); +} diff --git a/src/chain/chain-supervision-splitter-test.cc b/src/chain/chain-supervision-splitter-test.cc new file mode 100644 index 00000000000..abf0264dde6 --- /dev/null +++ b/src/chain/chain-supervision-splitter-test.cc @@ -0,0 +1,306 @@ +// chain/chain-supervision-splitter-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-supervision-splitter.h" +#include "chain/chain-supervision.h" +#include "fstext/fstext-lib.h" +#include "hmm/hmm-test-utils.h" +#include "hmm/hmm-utils.h" +#include +#include "fstext/kaldi-fst-io.h" +#include "lat/lattice-functions.h" + +namespace kaldi { +namespace chain { + + +void FstToLabels(const fst::StdVectorFst &fst, + std::vector > *labels) { + std::vector state_times; + int32 num_frames = ComputeFstStateTimes(fst, &state_times); + + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + + std::vector > temp_labels(num_frames); + labels->clear(); + labels->resize(num_frames); + + for (StateId s = 0; s < fst.NumStates(); s++) { + for (fst::ArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + + int32 t = state_times[s]; + KALDI_ASSERT(arc.ilabel == arc.olabel && arc.ilabel != 0); + + temp_labels[t].insert(arc.olabel); + } + } + + int32 t = 0; + for (std::vector >::const_iterator it = temp_labels.begin(); + it != temp_labels.end(); ++it, t++) { + (*labels)[t].Init(*it); + } +} + +void TestSupervisionLatticeSplitting( + const SupervisionOptions &sup_opts, + const TransitionModel &trans_model, + Lattice &lat) { + + fst::TopSort(&lat); + + chain::SupervisionLatticeSplitterOptions opts; + chain::SupervisionLatticeSplitter sup_lat_splitter( + opts, sup_opts, trans_model); + sup_lat_splitter.LoadLattice(lat); + + std::vector state_times; + int32 num_frames_lat = LatticeStateTimes(lat, &state_times); + + Posterior post; + LatticeForwardBackward(lat, &post); + + KALDI_ASSERT(num_frames_lat == post.size()); + + std::vector > pdfs(post.size()); + for (size_t i = 0; i < post.size(); i++) { + std::vector this_pdfs; + for (size_t j = 0; j < post[i].size(); j++) { + this_pdfs.push_back(trans_model.TransitionIdToPdf(post[i][j].first) + 1); + } + pdfs[i].Init(this_pdfs); + } + + for (int32 i = 0; i < 3; i++) { + int32 start_frame = RandInt(0, num_frames_lat - 1), + num_frames = RandInt(1,10); + + if (start_frame + num_frames > num_frames_lat) { + num_frames = num_frames_lat - start_frame; + } + + chain::Supervision supervision_part; + sup_lat_splitter.GetFrameRangeSupervision( + start_frame, num_frames, &supervision_part); + + std::vector > labels; + FstToLabels(supervision_part.fst, &labels); + + KALDI_ASSERT(labels.size() == num_frames); + + for (int32 t = 0; t < labels.size(); t++) { + for (ConstIntegerSet::iterator it = labels[t].begin(); + it != labels[t].end(); ++it) { + // To check that each label is a pdf (1-indexed) within the tolerance + // in the original + bool label_in_original = false; + for (int32 n = std::max(start_frame + t - sup_opts.left_tolerance, 0); + n <= std::min(start_frame + t + sup_opts.right_tolerance, num_frames_lat - 1); + n++) { + if (pdfs[n].count(*it)) { + label_in_original = true; + break; + } + } + KALDI_ASSERT(label_in_original); + } + } + + std::vector self_loop_pdfs_list; + for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) { + if 
(trans_model.IsSelfLoop(tid)) { + int32 tstate = trans_model.TransitionIdToTransitionState(tid); + int32 pdf = trans_model.TransitionStateToSelfLoopPdf(tstate); + self_loop_pdfs_list.push_back(pdf); + } + } + + ConstIntegerSet self_loop_pdfs(self_loop_pdfs_list); + + // To check that each self-loop pdf in the original is contained as a label + // in at least 2 of the tolerance values of the split lattices. + for (int32 n = start_frame; n < start_frame + num_frames; n++) { + for (ConstIntegerSet::iterator it = pdfs[n].begin(); + it != pdfs[n].end(); ++it) { + if (!self_loop_pdfs.count(*it - 1)) continue; // Ignore forward pdfs + int32 pdf_count = 0; + for (int32 t = std::max(n - start_frame - sup_opts.left_tolerance, 0); + t <= std::min(n - start_frame + sup_opts.right_tolerance, num_frames - 1); t++) { + pdf_count += labels[t].count(*it); + } + //KALDI_ASSERT(pdf_count > 1); + } + } + } +} + +TransitionModel* GetSimpleChainTransitionModel( + ContextDependency **ctx_dep, int32 num_phones) { + + std::ostringstream oss; + + oss << "\n" + "\n" + " "; + for (int32 i = 1; i <= num_phones; i++) { + oss << i << " "; + } + oss << "\n" + " 0 0 1\n" + " 0 0.5\n" + " 1 0.5\n" + " \n" + " 1 \n" + "\n" + "\n"; + + std::string chain_input_str = oss.str(); + + HmmTopology topo; + std::istringstream iss(chain_input_str); + topo.Read(iss, false); + + const std::vector &phones = topo.GetPhones(); + + std::vector phone2num_pdf_classes (1+phones.back()); + for (size_t i = 0; i < phones.size(); i++) + phone2num_pdf_classes[phones[i]] = topo.NumPdfClasses(phones[i]); + + *ctx_dep = MonophoneContextDependency(phones, phone2num_pdf_classes); + + return new TransitionModel(**ctx_dep, topo); +} + +void ChainSupervisionSplitterTest(int32 index) { + ContextDependency *ctx_dep; + TransitionModel *trans_model; + + if (Rand()) + trans_model = GenRandTransitionModel(&ctx_dep, 2); + else + trans_model = GetSimpleChainTransitionModel(&ctx_dep, 2); + + const std::vector &phones = trans_model->GetPhones(); + + int32 subsample_factor = 1; + + int32 phone_sequence_length = RandInt(1, 10); + + CompactLattice clat; + int32 cur_state = clat.AddState(); + clat.SetStart(cur_state); + + bool reorder = true; + + int32 num_frames_subsampled = 0; + for (int32 i = 0; i < phone_sequence_length; i++) { + int32 phone = phones[RandInt(0, phones.size() - 1)]; + int32 next_state = clat.AddState(); + + std::vector tids; + GenerateRandomAlignment(*ctx_dep, *trans_model, reorder, + std::vector(1, phone), &tids); + clat.AddArc(cur_state, + CompactLatticeArc(phone, phone, + CompactLatticeWeight(LatticeWeight::One(), + tids), next_state)); + cur_state = next_state; + num_frames_subsampled += tids.size(); + } + clat.SetFinal(cur_state, CompactLatticeWeight::One()); + + Lattice lat; + fst::ConvertLattice(clat, &lat); + + chain::SupervisionOptions sup_opts; + sup_opts.left_tolerance = 1; + sup_opts.right_tolerance = 1; + sup_opts.frame_subsampling_factor = subsample_factor; + sup_opts.lm_scale = 0.5; + + fst::StdVectorFst tolerance_fst; + GetToleranceEnforcerFst(sup_opts, *trans_model, &tolerance_fst); + WriteFstKaldi(std::cerr, false, tolerance_fst); + + TestSupervisionLatticeSplitting(sup_opts, *trans_model, lat); + + delete ctx_dep; + delete trans_model; +} + +void TestToleranceFst(chain::SupervisionOptions &sup_opts, int32 num_phones) { + ContextDependency *ctx_dep; + TransitionModel *trans_model = GetSimpleChainTransitionModel(&ctx_dep, num_phones); + + fst::StdVectorFst tolerance_fst; + GetToleranceEnforcerFst(sup_opts, *trans_model, 
&tolerance_fst); + WriteFstKaldi(std::cerr, false, tolerance_fst); + + fst::ArcSort(&tolerance_fst, fst::ILabelCompare()); + + delete ctx_dep; + delete trans_model; +} + +} // namespace chain +} // namespace kaldi + +int main(int argc, char *argv[]) { + using namespace kaldi; + SetVerboseLevel(2); + + const char *usage = "chain-supervision-test [options]"; + + ParseOptions po(usage); + + int32 num_phones = 1; + + po.Register("num-phones", &num_phones, + "Number of phones"); + + chain::SupervisionOptions sup_opts; + sup_opts.left_tolerance = 1; + sup_opts.right_tolerance = 1; + sup_opts.frame_subsampling_factor = 1; + sup_opts.lm_scale = 0.5; + + sup_opts.Register(&po); + + po.Read(argc, argv); + + sup_opts.left_tolerance = 1; + sup_opts.right_tolerance = 1; + kaldi::chain::TestToleranceFst(sup_opts, num_phones); + + sup_opts.left_tolerance = 0; + sup_opts.right_tolerance = 0; + kaldi::chain::TestToleranceFst(sup_opts, num_phones); + + return 0; + + for (int32 i = 0; i < 10; i++) { + kaldi::chain::ChainSupervisionSplitterTest(i); + } + //kaldi::chain::TestRanges(); +} diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc new file mode 100644 index 00000000000..fc1ab65e630 --- /dev/null +++ b/src/chain/chain-supervision-splitter.cc @@ -0,0 +1,793 @@ +// chain/chain-supervision-splitter.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014-2015 Vimal Manohar +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-supervision-splitter.h" +#include "chain/chain-supervision.h" +#include "lat/lattice-functions.h" + +namespace kaldi { +namespace chain { + +typedef fst::ArcTpl LatticeArc; +typedef fst::VectorFst Lattice; + +const int kSupervisionMaxStates = 200000; // we can later make this + // configurable if needed. + +void FstToLattice(const fst::StdVectorFst &fst, Lattice *lat) { + lat->DeleteStates(); + + int32 start_state = fst.Start(); + for (int32 i = 0; i < fst.NumStates(); i++) + lat->AddState(); + + lat->SetStart(start_state); + + for (fst::StdArc::StateId s = 0; s < fst.NumStates(); s++) { + for (fst::ArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + + LatticeWeight weight = LatticeWeight::One(); + weight.SetValue1(arc.weight.Value()); + + lat->AddArc(s, + LatticeArc(arc.ilabel, arc.olabel, weight, arc.nextstate)); + } + + if (fst.Final(s) != fst::TropicalWeight::Zero()) { + LatticeWeight weight = LatticeWeight::One(); + weight.SetValue1(fst.Final(s).Value()); + lat->SetFinal(s, weight); + } + } +} + +/** This function converts lattice to one with pdf_id + 1 as olabels. + This assumes that the ilabels are transition_ids. 
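+    For example, an arc whose ilabel is transition-id tid gets olabel
+    tmodel.TransitionIdToPdf(tid) + 1, while epsilon arcs (ilabel == 0) keep
+    olabel 0.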
+*/ +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, Lattice *lat) { + typedef LatticeArc::StateId StateId; + StateId num_states = lat->NumStates(); + for (StateId s = 0; s < num_states; s++) { + for (fst::MutableArcIterator iter(lat, s); + !iter.Done(); iter.Next()) { + LatticeArc arc = iter.Value(); + if (arc.ilabel == 0) + arc.olabel = 0; // epsilon arc + else + arc.olabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 + iter.SetValue(arc); + } + } +} + +bool LatticeToNumeratorPost(const Lattice &lat, + const TransitionModel &trans_model, + const fst::StdVectorFst &fst, + Posterior *post, std::string key) { + Lattice pdf_lat = lat; + ConvertLatticeToPdfLabels(trans_model, &pdf_lat); + + fst::Project(&pdf_lat, fst::PROJECT_OUTPUT); + fst::StdVectorFst sup_fst; + ConvertLattice(pdf_lat, &sup_fst); + + if (fst.NumStates() > 0) { + if (!AddWeightToFst(fst, &sup_fst)) { + if (!key.empty()) + KALDI_WARN << "For key " << key << ", "; + KALDI_WARN << "FST was empty after composing with FST. " + << "This should be extremely rare (a few per corpus, at most)"; + return false; + } + } + + // Convert fst to lattice to extract posterior using forward backward. + Lattice lat_copy; + ConvertFstToLattice(sup_fst, &lat_copy); + + kaldi::uint64 props = lat_copy.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&lat_copy) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + + LatticeForwardBackward(lat_copy, post); + return true; +} + +SupervisionLatticeSplitter::SupervisionLatticeSplitter( + const SupervisionLatticeSplitterOptions &opts, + const SupervisionOptions &sup_opts, + const TransitionModel &trans_model, + const fst::StdVectorFst &den_fst): + sup_opts_(sup_opts), opts_(opts), trans_model_(trans_model), + den_fst_(den_fst) { + + if (opts_.convert_to_unconstrained) { + KALDI_WARN << "--convert-to-unconstrained=true; " + << "--left-tolerance and --right-tolerance will be ignored."; + } else { + MakeToleranceEnforcerFst(); + } +} + +bool SupervisionLatticeSplitter::LoadLattice(const Lattice &lat) { + lat_ = lat; + + if (!PrepareLattice()) + return false; + + int32 num_states = lat_.NumStates(); + + KALDI_ASSERT(num_states > 0); // TODO: Might have to be skipped instead. + int32 start_state = lat_.Start(); + + // Lattice should be top-sorted and connected, so start-state must be 0. + KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0"); + + KALDI_ASSERT(num_states == lat_scores_.state_times.size()); + KALDI_ASSERT(lat_scores_.state_times[start_state] == 0); + return true; +} + +bool SupervisionLatticeSplitter::GetFrameRangeSupervision( + int32 begin_frame, int32 num_frames, + Supervision *supervision, + Lattice *out_lat, Lattice *raw_range_lat) const { + int32 end_frame = begin_frame + num_frames; + // Note: end_frame is not included in the range of frames that the + // output supervision object covers; it's one past the end. + KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 && + begin_frame + num_frames <= lat_scores_.state_times.back()); + + Lattice lat_out; + CreateRangeLattice(begin_frame, end_frame, &lat_out); + + if (raw_range_lat) { + *raw_range_lat = lat_out; + } + + PostProcessLattice(&lat_out); + + if (out_lat) { + *out_lat = lat_out; + } + + if (den_fst_.NumStates() == 0) { + // Apply lm-scale on the lattice and remove the acoustic costs + ScaleLattice(fst::LatticeScale(sup_opts_.lm_scale, 0.0), &lat_out); + } else { + // Otherwise the lm_scale has already been applied. 
So just remove the + // acoustic costs. + ScaleLattice(fst::LatticeScale(1.0, 0.0), &lat_out); + } + + supervision->frames_per_sequence = num_frames; + return GetSupervision(lat_out, supervision); +} + +bool SupervisionLatticeSplitter::GetFrameRangeProtoSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + int32 begin_frame, int32 num_frames, + ProtoSupervision *proto_supervision) const { + + int32 end_frame = begin_frame + num_frames; + // Note: end_frame is not included in the range of frames that the + // output supervision object covers; it's one past the end. + KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 && + begin_frame + num_frames <= lat_scores_.state_times.back()); + + Lattice lat_out; + CreateRangeLattice(begin_frame, end_frame, &lat_out); + + PostProcessLattice(&lat_out); + + if (opts_.debug && GetVerboseLevel() > 2) { + WriteLattice(std::cerr, false, lat_out); + } + + CompactLattice clat_part; + ConvertLattice(lat_out, &clat_part); + + + return PhoneLatticeToProtoSupervision(sup_opts_, clat_part, + proto_supervision); +} + +void SupervisionLatticeSplitter::LatticeInfo::Check() const { + // Check if all the vectors are of size num_states + KALDI_ASSERT(state_times.size() == alpha.size() && + state_times.size() == beta.size()); + + // Check that the states are ordered in increasing order of state_times. + // This must be true since the states are in breadth-first search order. + KALDI_ASSERT(IsSorted(state_times)); + + KALDI_ASSERT(state_times.back() == num_frames); +} + +bool SupervisionLatticeSplitter::PrepareLattice() { + // Scale the lattice to appropriate acoustic scale. + KALDI_ASSERT(opts_.acoustic_scale != 0.0); + if (opts_.acoustic_scale != 1.0) + fst::ScaleLattice(fst::AcousticLatticeScale( + opts_.acoustic_scale), &lat_); + + if (den_fst_.NumStates() > 0) { + ScaleLattice(fst::GraphLatticeScale(sup_opts_.lm_scale), &lat_); + Lattice lat_out = lat_; + ConvertLatticeToPdfLabels(trans_model_, &lat_out); + // Now ilabel is transition-id, olabel is pdf-id+1. + // So we can compose the denominator fst on the right. + + // Note: den_fst_ is already scaled by 1.0 - lm_scale + Lattice den_lat; + FstToLattice(den_fst_, &den_lat); + fst::ArcSort(&den_lat, fst::ILabelCompare()); + + fst::Compose(lat_out, den_lat, &lat_); + // In lat_, ilabel is transition-id, olabel is pdf-id+1 + + if (lat_.NumStates() == 0) + return false; + } + + KALDI_ASSERT(fst::TopSort(&lat_)); + LatticeStateTimes(lat_, &(lat_scores_.state_times)); + int32 num_states = lat_.NumStates(); + std::vector > state_time_indexes(num_states); + for (int32 s = 0; s < num_states; s++) { + state_time_indexes[s] = std::make_pair(lat_scores_.state_times[s], s); + } + + // Order the states based on the state times. This is stronger than just + // topological sort. This is required by the lattice splitting code. 
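+  // (Concretely: after fst::StateSort below, a state with a smaller index
+  // never has a larger state time, so lat_scores_.state_times is
+  // non-decreasing in the state index.  CreateRangeLattice() relies on this
+  // when it uses std::lower_bound over state_times to locate the frame range,
+  // and LatticeInfo::Check() asserts it via IsSorted().)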
+ std::sort(state_time_indexes.begin(), state_time_indexes.end()); + + std::vector state_order(num_states); + for (int32 s = 0; s < num_states; s++) { + state_order[state_time_indexes[s].second] = s; + } + + fst::StateSort(&lat_, state_order); + ComputeLatticeScores(); + + return true; +} + +void SupervisionLatticeSplitter::CreateRangeLattice( + int32 begin_frame, int32 end_frame, + Lattice *out_lat) const { + typedef Lattice::StateId StateId; + typedef LatticeArc::Label Label; + + const std::vector &state_times = lat_scores_.state_times; + + // Some checks to ensure the lattice and scores are prepared properly + KALDI_ASSERT(state_times.size() == lat_.NumStates()); + if (!lat_.Properties(fst::kTopSorted, true)) + KALDI_ERR << "Input lattice must be topologically sorted."; + + std::vector::const_iterator begin_iter = + std::lower_bound(state_times.begin(), state_times.end(), begin_frame), + end_iter = std::lower_bound(begin_iter, + state_times.end(), end_frame); + + // begin_iter should point to the first state with time == begin_frame + KALDI_ASSERT(*begin_iter == begin_frame && + (begin_iter == state_times.begin() || + begin_iter[-1] < begin_frame)); + + // even if end_frame == supervision_.num_frames, there should be a state with + // that frame index. + KALDI_ASSERT(end_iter[-1] < end_frame && + (end_iter < state_times.end() || *end_iter == end_frame)); + + StateId begin_state = begin_iter - state_times.begin(), + end_state = end_iter - state_times.begin(); + + KALDI_ASSERT(end_state > begin_state); + out_lat->DeleteStates(); + out_lat->ReserveStates(end_state - begin_state + 2); + + // Add special start state + StateId start_state = out_lat->AddState(); + out_lat->SetStart(start_state); + + KALDI_ASSERT(out_lat->Start() == 0); + + for (StateId i = begin_state; i < end_state; i++) + out_lat->AddState(); + + // Add the special final-state. + StateId final_state = out_lat->AddState(); + out_lat->SetFinal(final_state, LatticeWeight::One()); + + for (StateId state = begin_state; state < end_state; state++) { + StateId output_state = state - begin_state + 1; + if (state_times[state] == begin_frame) { + // we'd like to make this an initial state, but OpenFst doesn't allow + // multiple initial states. Instead we add an epsilon transition to it + // from our actual initial state. The weight on this + // transition is the forward probability of the said 'initial state' + LatticeWeight weight = LatticeWeight::One(); + weight.SetValue1((opts_.normalize ? lat_scores_.beta[0] : 0.0) + - lat_scores_.alpha[state]); + // Add negative of the forward log-probability to the graph cost score, + // since the acoustic scores would be changed later. + // Assuming that the lattice is scaled with appropriate acoustic + // scale. + // We additionally normalize using the total lattice score. Since the + // same score is added as normalizer to all the paths in the lattice, + // the relative probabilities of the paths in the lattice is not affected. + // Note: Doing a forward-backward on this split must result in a total + // score of 0 because of the normalization. + + out_lat->AddArc(start_state, + LatticeArc(0, 0, weight, output_state)); + } else { + KALDI_ASSERT(lat_scores_.state_times[state] < end_frame); + } + for (fst::ArcIterator aiter(lat_, state); + !aiter.Done(); aiter.Next()) { + const LatticeArc &arc = aiter.Value(); + StateId nextstate = arc.nextstate; + if (nextstate >= end_state) { + // A transition to any state outside the range becomes a transition to + // our special final-state. 
+ // The weight is just the negative of the backward log-probability + + // the arc cost. We again normalize with the total lattice score. + LatticeWeight weight; + //KALDI_ASSERT(lat_scores_.beta[state] < 0); + weight.SetValue1(arc.weight.Value1() - lat_scores_.beta[nextstate]); + weight.SetValue2(arc.weight.Value2()); + // Add negative of the backward log-probability to the LM score, since + // the acoustic scores would be changed later. + // Note: We don't normalize here because that is already done with the + // initial cost. + + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); + } else { + StateId output_nextstate = nextstate - begin_state + 1; + + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, arc.weight, output_nextstate)); + } + } + } + + KALDI_ASSERT(out_lat->Start() == 0); + + if (opts_.debug) { + Posterior post; + + Lattice &temp_lat(*out_lat); + //fst::RmEpsilon(&temp_lat); + fst::TopSort(&temp_lat); + + double like = LatticeForwardBackward(temp_lat, &post); + + KALDI_ASSERT(kaldi::ApproxEqual( + like + (opts_.normalize ? lat_scores_.beta[0] : 0.0), + lat_scores_.beta[0])); + + const Posterior &full_post = lat_scores_.post; + + for (int32 t = begin_frame; t < end_frame; t++) { + KALDI_ASSERT(full_post[t].size() == post[t - begin_frame].size()); + + for (int32 j = 0; j < full_post[t].size(); j++) { + KALDI_ASSERT(post[t - begin_frame][j].first == full_post[t][j].first); + if (post[t-begin_frame][j].second < 0.1) + continue; + if (!kaldi::ApproxEqual(post[t - begin_frame][j].second, + full_post[t][j].second)) { + WritePosterior(std::cerr, false, full_post); + WritePosterior(std::cerr, false, post); + + std::vector alphas; + std::vector betas; + ComputeLatticeAlphasAndBetas(temp_lat, false, &alphas, &betas); + + fst::StdVectorFst full_fst; + Lattice full_lat(lat_); + fst::ScaleLattice(fst::AcousticLatticeScale(0), &full_lat); + ConvertLattice(full_lat, &full_fst); + WriteFstKaldi(std::cerr, false, full_fst); + + fst::StdVectorFst split_fst; + fst::ScaleLattice(fst::AcousticLatticeScale(0), out_lat); + ConvertLattice(*out_lat, &split_fst); + WriteFstKaldi(std::cerr, false, split_fst); + + KALDI_ASSERT(false); + } + } + } + } +} + +void SupervisionLatticeSplitter::PostProcessLattice(Lattice *out_lat) const { + fst::RmEpsilon(out_lat); + + if (opts_.acoustic_scale != 1.0) { + fst::ScaleLattice(fst::AcousticLatticeScale( + 1.0 / opts_.acoustic_scale), out_lat); + } +} + +bool SupervisionLatticeSplitter::GetSupervision( + const Lattice &lat, Supervision *supervision) const { + fst::StdVectorFst transition_id_fst; + ConvertLattice(lat, &transition_id_fst); + Project(&transition_id_fst, fst::PROJECT_INPUT); // Keep only the transition-ids. + if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { + // remove epsilons, if there are any. 
+ fst::RmEpsilon(&transition_id_fst); + } + + KALDI_ASSERT(transition_id_fst.NumStates() > 0); + + if (opts_.convert_to_unconstrained) { + supervision->label_dim = trans_model_.NumTransitionIds(); + std::swap(transition_id_fst, supervision->fst); + return ConvertSupervisionToUnconstrained(trans_model_, supervision); + } else { + supervision->label_dim = trans_model_.NumPdfs(); + } + + fst::TableComposeOptions compose_opts; + compose_opts.table_match_type = fst::MATCH_INPUT; + + TableCompose(transition_id_fst, tolerance_fst_, &(supervision->fst), + compose_opts); + + fst::Connect(&(supervision->fst)); + + // at this point supervision->fst will have pdf-ids plus one as the olabels, + // but still transition-ids as the ilabels. Copy olabels to ilabels. + fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + + fst::RmEpsilon(&(supervision->fst)); + fst::DeterminizeInLog(&(supervision->fst)); + + if (den_fst_.NumStates() > 0) { + TryDeterminizeMinimize(kSupervisionMaxStates, + &(supervision->fst)); + } + + if (opts_.debug) { + std::cerr << "tolerance added fst"; + fst::WriteFstKaldi(std::cerr, false, supervision->fst); + } + + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + if (supervision->fst.NumStates() == 0) { + KALDI_WARN << "Supervision FST is empty (too many phones for too few " + << "frames?)"; + // possibly there were too many phones for too few frames. + return false; + } + + supervision->weight = 1.0; + supervision->num_sequences = 1; + supervision->label_dim = trans_model_.NumPdfs(); + if (!opts_.convert_to_unconstrained) + SortBreadthFirstSearch(&(supervision->fst)); + + return true; +} + +void SupervisionLatticeSplitter::ComputeLatticeScores() { + lat_scores_.Reset(); + lat_scores_.num_frames = LatticeStateTimes(lat_, &(lat_scores_.state_times)); + + if (opts_.debug) + LatticeForwardBackward(lat_, &(lat_scores_.post)); + + ComputeLatticeAlphasAndBetas(lat_, false, + &(lat_scores_.alpha), &(lat_scores_.beta)); + lat_scores_.Check(); + // This check will fail if the lattice is not breadth-first search sorted +} + +class ToleranceEnforcerFstCreator { + public: + ToleranceEnforcerFstCreator( + const SupervisionOptions &opts, const TransitionModel &trans_model, + fst::StdVectorFst *fst); + + void MakeFst(); + + private: + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + + enum StateType { + kInit, + kDeletion, + kAccept, + kInsertion + }; + + inline int32 GetStateId(int32 offset, int32 forward_id, int32 type) { + return ((offset + zero_offset_index_) * (num_forward_transitions_ * 3 + 1) + + (type == kInit ? 0 : 1 + (type - 1) * num_forward_transitions_ + + forward_id) + 1); + } + + void AddArcsForOffset(int32 offset); + void AddArcsForForwardTransition(int32 offset, int32 forward_id, int32 trans_id); + void AddArcsBetweenOffsets(int32 offset, int32 forward_id, int32 trans_id); + + const SupervisionOptions &opts_; + const TransitionModel &trans_model_; + + int32 num_forward_transitions_; // number of forward transitions in the + // transition model + int32 num_offsets_; // number of offsets (tolerances) + + // The index corresponding to the zero offset. 
+ // offset_index = offset + zero_offset_index_ + int32 zero_offset_index_; + + fst::StdVectorFst *fst_; +}; + +ToleranceEnforcerFstCreator::ToleranceEnforcerFstCreator( + const SupervisionOptions &opts, const TransitionModel &trans_model, + fst::StdVectorFst *fst): + opts_(opts), trans_model_(trans_model), fst_(fst) { + + num_forward_transitions_ = 0; + for (int32 trans_id = 1; trans_id <= trans_model_.NumTransitionIds(); + trans_id++) { + if (!trans_model_.IsSelfLoop(trans_id)) { + num_forward_transitions_++; + } + } + num_offsets_ = opts_.left_tolerance + opts_.right_tolerance + 1; + zero_offset_index_ = opts_.left_tolerance; + + fst_->DeleteStates(); +} + +void ToleranceEnforcerFstCreator::AddArcsForForwardTransition( + int32 offset, int32 forward_id, int32 trans_id) { + StateId init_state = GetStateId(offset, forward_id, kInit); + + if (offset == 0 && forward_id == 0) + fst_->SetFinal(init_state, fst::TropicalWeight::One()); + + // We expect this is to be a forward transition + KALDI_ASSERT(!trans_model_.IsSelfLoop(trans_id)); + int32 forward_pdf = trans_model_.TransitionIdToPdf(trans_id); + int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); + int32 self_loop_tid = trans_model_.SelfLoopOf(tstate); + int32 self_loop_pdf = trans_model_.TransitionIdToPdf(self_loop_tid); + + // self-loop accepting forward-tid + fst_->AddArc(init_state, + fst::StdArc(trans_id, forward_pdf + 1, + fst::TropicalWeight::One(), + init_state)); + + for (int32 i = 1; i <= 3; i++) { + StateId next_state = GetStateId(offset, forward_id, i); + + if (i == kDeletion || i == kInsertion) { + // epsilon-arc to initial state + fst_->AddArc(next_state, + fst::StdArc(0, 0, + fst::TropicalWeight::One(), + init_state)); + } + + if (i == kAccept) { + // accept a forward transition from initial state + fst_->AddArc(init_state, + fst::StdArc(trans_id, forward_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + + // self-loop accepting self-loop tid + fst_->AddArc(next_state, + fst::StdArc(self_loop_tid, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + + // self-loop transition back to initial state + fst_->AddArc(next_state, + fst::StdArc(self_loop_tid, self_loop_pdf + 1, + fst::TropicalWeight::One(), + init_state)); + } + } +} + +void ToleranceEnforcerFstCreator::AddArcsBetweenOffsets( + int32 offset, int32 forward_id, int32 trans_id) { + // We expect this is to be a forward transition + KALDI_ASSERT(!trans_model_.IsSelfLoop(trans_id)); + int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); + int32 self_loop_tid = trans_model_.SelfLoopOf(tstate); + int32 self_loop_pdf = trans_model_.TransitionIdToPdf(self_loop_tid); + + if (offset > -opts_.left_tolerance) { + StateId accept_state = GetStateId(offset, forward_id, kAccept); + StateId delete_state = GetStateId(offset, forward_id, kDeletion); + StateId next_state = GetStateId(offset - 1, forward_id, kDeletion); + + fst_->AddArc(accept_state, + fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), + next_state)); + fst_->AddArc(delete_state, + fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), + next_state)); + } + + if (offset < opts_.right_tolerance) { + StateId accept_state = GetStateId(offset, forward_id, kAccept); + StateId insert_state = GetStateId(offset, forward_id, kInsertion); + StateId next_state = GetStateId(offset + 1, forward_id, kInsertion); + + fst_->AddArc(accept_state, + fst::StdArc(0, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + + fst_->AddArc(insert_state, + 
fst::StdArc(0, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + } + + if (offset == 0) { + if (forward_id == 0) { + // Add arc from start state to the offset 0 initial state. + // This is the normal case when there is no partial phone in the lattice. + StateId init_state = GetStateId(offset, forward_id, kInit); + fst_->AddArc(0, fst::StdArc(0, 0, + fst::TropicalWeight::One(), + init_state)); + } + + // Add self-loop on start state accepting the self-loop transition + // of a partial phone. + fst_->AddArc(0, fst::StdArc(self_loop_tid, self_loop_pdf + 1, + fst::TropicalWeight::One(), + 0)); + + if (offset > -opts_.left_tolerance) { + // Add arc from start state deleting a self-loop transition + StateId next_state = GetStateId(offset - 1, forward_id, kDeletion); + fst_->AddArc(0, fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), + next_state)); + } + } +} + +void ToleranceEnforcerFstCreator::AddArcsForOffset(int32 offset) { + int32 forward_id = 0; + for (int32 trans_id = 1; trans_id <= trans_model_.NumTransitionIds(); + trans_id++) { + if (!trans_model_.IsSelfLoop(trans_id)) { + AddArcsForForwardTransition(offset, forward_id, trans_id); + AddArcsBetweenOffsets(offset, forward_id, trans_id); + forward_id++; + } + } +} + +void ToleranceEnforcerFstCreator::MakeFst() { + int32 num_states = num_offsets_ * (3 * num_forward_transitions_ + 1) + 1; + fst_->ReserveStates(num_states); + + for (int32 s = 0; s < num_states; s++) + fst_->AddState(); + + fst_->SetStart(0); + + for (int32 o = -opts_.left_tolerance; o <= opts_.right_tolerance; o++) { + AddArcsForOffset(o); + } + + if (GetVerboseLevel() > 3) { WriteFstKaldi(std::cerr, false, *fst_); } + + fst::Connect(fst_); + fst::ArcSort(fst_, fst::ILabelCompare()); +} + +void SupervisionLatticeSplitter::MakeToleranceEnforcerFst() { + GetToleranceEnforcerFst(sup_opts_, trans_model_, &tolerance_fst_); +} + +void GetToleranceEnforcerFst(const SupervisionOptions &sup_opts, + const TransitionModel &trans_model, + fst::StdVectorFst *tolerance_fst) { + ToleranceEnforcerFstCreator creator(sup_opts, trans_model, tolerance_fst); + creator.MakeFst(); +} + +/* +bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, + const TransitionModel &trans_model, + const Lattice &lat, + chain::Supervision *supervision, + bool debug) { + fst::StdVectorFst transition_id_fst; + ConvertLattice(lat, &transition_id_fst); + Project(&transition_id_fst, fst::PROJECT_INPUT); // Keep only the transition-ids. + if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { + // remove epsilons, if there are any. + fst::RmEpsilon(&transition_id_fst); + } + KALDI_ASSERT(transition_id_fst.NumStates() > 0); + + fst::TableComposeOptions compose_opts; + compose_opts.table_match_type = fst::MATCH_INPUT; + + TableCompose(transition_id_fst, tolerance_fst, &(supervision->fst), + compose_opts); + fst::Connect(&(supervision->fst)); + + if (debug) { + fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + fst::RmEpsilon(&(supervision->fst)); + + return true; + } + + // at this point supervision->fst will have pdf-ids plus one as the olabels, + // but still transition-ids as the ilabels. Copy olabels to ilabels. 
+ fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + + fst::RmEpsilon(&(supervision->fst)); + fst::DeterminizeInLog(&(supervision->fst)); + + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + if (supervision->fst.NumStates() == 0) { + KALDI_WARN << "Supervision FST is empty (too many phones for too few " + << "frames?)"; + // possibly there were too many phones for too few frames. + return false; + } + + supervision->weight = 1.0; + supervision->num_sequences = 1; + supervision->frames_per_sequence = 0; + supervision->label_dim = trans_model.NumPdfs(); + SortBreadthFirstSearch(&(supervision->fst)); + return true; +} +*/ + +} // end namespace chain +} // end namespace kaldi diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h new file mode 100644 index 00000000000..6d74fc1a8ef --- /dev/null +++ b/src/chain/chain-supervision-splitter.h @@ -0,0 +1,164 @@ +// chain/chain-supervision-splitter.h + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014-2015 Vimal Manohar +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
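For orientation, here is a minimal sketch of how the splitter declared in this new header can be driven, inferred only from the public interface that follows; the option values, the chunk length, and the helper function are placeholders and are not part of the patch:

#include "chain/chain-supervision-splitter.h"

namespace kaldi {
namespace chain {

// Hypothetical helper: 'trans_model', 'den_fst' and 'lat' are assumed to have
// been loaded elsewhere (e.g. from a transition model, denominator FST and
// lattice archive).
void SplitLatticeIntoChunks(const TransitionModel &trans_model,
                            const fst::StdVectorFst &den_fst,
                            const Lattice &lat) {
  SupervisionOptions sup_opts;              // tolerances, subsampling factor, ...
  SupervisionLatticeSplitterOptions opts;   // acoustic-scale, normalize, debug, ...
  SupervisionLatticeSplitter splitter(opts, sup_opts, trans_model, den_fst);
  if (!splitter.LoadLattice(lat)) return;   // sorts the lattice, computes alpha/beta.
  int32 chunk_size = 150;                   // frames per chunk (placeholder value),
                                            // in the same units as NumFrames().
  for (int32 t = 0; t + chunk_size <= splitter.NumFrames(); t += chunk_size) {
    Supervision sup;
    if (splitter.GetFrameRangeSupervision(t, chunk_size, &sup)) {
      // ... use 'sup', e.g. pack it into a training example.
    }
  }
}

}  // namespace chain
}  // namespace kaldi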
+ +#ifndef KALDI_CHAIN_CHAIN_SUPERVISION_SPILTTER_H_ +#define KALDI_CHAIN_CHAIN_SUPERVISION_SPILTTER_H_ + +#include "hmm/transition-model.h" +#include "lat/kaldi-lattice.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace chain { + +typedef fst::ArcTpl LatticeArc; +typedef fst::VectorFst Lattice; + +struct SupervisionLatticeSplitterOptions { + BaseFloat acoustic_scale; + bool normalize; + bool convert_to_unconstrained; + bool debug; + + SupervisionLatticeSplitterOptions(): + acoustic_scale(1.0), normalize(true), + convert_to_unconstrained(false), debug(false) { } + + void Register(OptionsItf *opts) { + opts->Register("acoustic-scale", &acoustic_scale, + "Apply acoustic scale on the lattices before splitting."); + opts->Register("normalize", &normalize, + "Normalize the initial and final scores added to split " + "lattices"); + opts->Register("convert-to-unconstrained", &convert_to_unconstrained, + "If this is true, then self-loop transitions in the " + "supervision are replaced by self-loops"); + opts->Register("debug", &debug, + "Run some debug test codes"); + } +}; + +class SupervisionLatticeSplitter { + public: + SupervisionLatticeSplitter(const SupervisionLatticeSplitterOptions &opts, + const SupervisionOptions &sup_opts, + const TransitionModel &trans_model, + const fst::StdVectorFst &den_fst); + + bool LoadLattice(const Lattice &lat); + + bool GetFrameRangeSupervision(int32 begin_frame, int32 frames_per_sequence, + chain::Supervision *supervision, + Lattice *lat = NULL, + Lattice *raw_range_lat = NULL) const; + + bool GetFrameRangeProtoSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + int32 begin_frame, int32 num_frames, + ProtoSupervision *proto_supervision) const; + + int32 NumFrames() const { return lat_scores_.num_frames; } + + // A structure used to store the forward and backward scores + // and state times of a lattice + struct LatticeInfo { + // These values are stored in log. + std::vector alpha; + std::vector beta; + std::vector state_times; + std::vector > > post; + int32 num_frames; + + void Reset() { + alpha.clear(); + beta.clear(); + state_times.clear(); + post.clear(); + } + + void Check() const; + }; + + const Lattice& GetLattice() const { return lat_; } + + const fst::StdVectorFst& ToleranceFst() const { return tolerance_fst_; } + private: + // Creates an output lattice covering frames begin_frame <= t < end_frame, + // assuming that the corresponding state-range that we need to + // include, begin_state <= s < end_state has been included. + // (note: the output lattice will also have two special initial and final + // states). + void CreateRangeLattice(int32 begin_frame, int32 end_frame, + Lattice *out_lat) const; + + void PostProcessLattice(Lattice *out_lat) const; + + bool GetSupervision(const Lattice &out_lat, Supervision *supervision) const; + + // Function to compute lattice scores for a lattice + void ComputeLatticeScores(); + + // Prepare lattice : + // 1) Order states in breadth-first search order + // 2) Compute states times, which must be a strictly non-decreasing vector + // 3) Compute lattice alpha and beta scores + bool PrepareLattice(); + + const SupervisionOptions &sup_opts_; + + const SupervisionLatticeSplitterOptions &opts_; + + const TransitionModel &trans_model_; + + fst::StdVectorFst tolerance_fst_; + void MakeToleranceEnforcerFst(); + + // Copy of the lattice loaded using LoadLattice(). 
+ // This is required because the lattice states + // need to be ordered in breadth-first search order. + Lattice lat_; + + // LatticeInfo object for lattice. + // This will be computed when PrepareLattice function is called. + LatticeInfo lat_scores_; + + fst::StdVectorFst den_fst_; +}; + +void GetToleranceEnforcerFst(const SupervisionOptions &opts, const TransitionModel &trans_model, fst::StdVectorFst *tolerance_fst); + +bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, + const TransitionModel &trans_model, + const Lattice &lat, + chain::Supervision *supervision, + bool debug = false); + +bool LatticeToNumeratorPost(const Lattice &lat, + const TransitionModel &trans_model, + const fst::StdVectorFst &fst, + Posterior *post, + std::string key = ""); + +} +} + +#endif // KALDI_CHAIN_CHAIN_SUPERVISION_SPLITTER_H_ diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7ee5ee117b0..658eca62340 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -26,7 +26,7 @@ #include "chain/chain-den-graph.h" #include "chain/chain-denominator.h" #include "hmm/hmm-utils.h" - +#include namespace kaldi { @@ -104,7 +104,7 @@ void TestSupervisionNumerator(const Supervision &supervision) { CuMatrix nnet_output_deriv(nnet_output.NumRows(), nnet_output.NumCols()); - num.Backward(&nnet_output_deriv); + num.Backward(1.0, &nnet_output_deriv); int32 dim = 3; Vector predicted_objf_changes(dim), @@ -332,6 +332,134 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, } } +void PrintMatrix(const CuMatrixBase &mat) { + std::cerr << " [ "; + for (int32 i = 0; i < mat.NumRows(); i++) { + for (int32 j = 0; j < mat.NumCols(); j++) { + std::cerr << mat(i, j) << " "; + } + std::cerr << "\n"; + } + std::cerr << " ] "; +} + + +void ChainSmbrTrainingTest(const DenominatorGraph &den_graph, + const Supervision &supervision) { + int32 num_sequences = supervision.num_sequences, + frames_per_sequence = supervision.frames_per_sequence; + if (frames_per_sequence == 1) // this will break some code. + return; + + CuMatrix nnet_output(num_sequences * frames_per_sequence, + den_graph.NumPdfs()); + + bool zero_output = (RandInt(0, 3) == 0); + if (!zero_output) + nnet_output.SetRandn(); + + ChainTrainingOptions opts; + if (RandInt(0, 1) == 1) + opts.leaky_hmm_coefficient = 0.2; + opts.leaky_hmm_coefficient = 0.1; + + { + KALDI_LOG << "LF-MMI training"; + BaseFloat objf, l2_term, weight; + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &l2_term, &weight, + &nnet_output_deriv); + } + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + KALDI_LOG << "LF-SMBR training"; + opts.use_smbr_objective = true; + opts.mmi_factor = 0.0; + opts.smbr_factor = 1.0; + BaseFloat objf, mmi_objf = 0.0, l2_term, weight; + ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &mmi_objf, &l2_term, &weight, + &nnet_output_deriv); + + { + // make sure each row of nnet_output_deriv sums to one (shift invariance of + // the nnet output). 
+ CuVector nnet_output_deriv_row_sums(nnet_output_deriv.NumRows()); + nnet_output_deriv_row_sums.AddColSumMat(1.0, nnet_output_deriv, 0.0); + KALDI_ASSERT(nnet_output_deriv_row_sums.Norm(2.0) < 0.1); + } + + KALDI_LOG << "Chain objf per frame is " << (objf / weight) + << " over " << weight << " frames (weighted)"; + + { // a check + BaseFloat output_deriv_sum = nnet_output_deriv.Sum(); + KALDI_LOG << "Sum of nnet-output-deriv is " << output_deriv_sum + << " vs. expected 0."; + KALDI_ASSERT(output_deriv_sum < 0.2); + } + + int32 num_tries = 5; + BaseFloat epsilon = 1.0e-04; + Vector predicted_objf_changes(num_tries), + observed_objf_changes(num_tries); + for (int32 p = 0; p < num_tries; p++) { + CuMatrix nnet_delta_output(nnet_output.NumRows(), + nnet_output.NumCols()); + nnet_delta_output.SetRandn(); + nnet_delta_output.Scale(epsilon); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, + nnet_delta_output, kTrans); + CuMatrix nnet_output_perturbed(nnet_delta_output); + nnet_output_perturbed.AddMat(1.0, nnet_output); + + BaseFloat objf_modified, mmi_objf_modified, l2_term_modified, weight_modified; + + ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, + nnet_output_perturbed, + &objf_modified, &mmi_objf_modified, &l2_term_modified, + &weight_modified, + NULL); + + observed_objf_changes(p) = objf_modified - objf; + } + KALDI_LOG << "Predicted objf changes are " << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are " << observed_objf_changes; + { + Vector error(predicted_objf_changes); + error.AddVec(-1.0, observed_objf_changes); + KALDI_LOG << "num-sequences = " << num_sequences << ", frames-per-sequence = " + << frames_per_sequence << ", relative accuracy is " + << (error.Norm(2.0) / predicted_objf_changes.Norm(2.0)); + } + + { + // we get inaccuracy for long segments, I think because there is a bias when we + // add random noise for it to increase the likelihood (for winner-take-all reasons) + // and for long utterances this bias adds up over the frames and tends to + // outweigh the random component that the gradient predicts (which will tend to + // cancel). Try to correct for this... + BaseFloat correction = (predicted_objf_changes.Sum() - observed_objf_changes.Sum()) / + predicted_objf_changes.Dim(); + observed_objf_changes.Add(correction); + KALDI_LOG << "Correcting observed objf changes for statistical effects, to " + << observed_objf_changes; + if (frames_per_sequence > 2 && + predicted_objf_changes.Norm(2.0) > 0.1 * epsilon) { + // if we only have the initial and final frames, due to the scaling-down + // of pdfs not in the numerator sequence the derivative might be zero, + // which would cause problems doing the comparison. + // note, epsilon = 1.0e-04. 
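+      // The assertion below requires the predicted and observed objective-function
+      // changes to agree to within 25% in a relative (l2-norm) sense.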
+ KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.25)); + } + } +} + void TestSupervisionSplitting(const ContextDependency &ctx_dep, const TransitionModel &trans_model, const Supervision &supervision) { @@ -534,6 +662,75 @@ void ChainSupervisionTest() { delete trans_model; } +void ChainSupervisionSimpleTest() { + ContextDependency *ctx_dep; + TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep); + const std::vector &phones = trans_model->GetPhones(); + + int32 subsample_factor = 3; + + int32 phone_sequence_length = 2; + std::vector > phones_durations(phone_sequence_length); + + CompactLattice clat; + int32 cur_state = clat.AddState(); + clat.SetStart(cur_state); + + for (int32 i = 0; i < phone_sequence_length; i++) { + int32 phone = phones[RandInt(0, phones.size() - 1)]; + int32 min_length = trans_model->GetTopo().MinLength(phone), + headroom = 5, + duration = RandInt(subsample_factor * min_length, + subsample_factor * min_length + headroom); + phones_durations[i].first = phone; + phones_durations[i].second = duration; + int32 next_state = clat.AddState(); + std::vector ones(duration, 1); + clat.AddArc(cur_state, + CompactLatticeArc(phone, phone, + CompactLatticeWeight(LatticeWeight::One(), + ones), next_state)); + cur_state = next_state; + } + clat.SetFinal(cur_state, CompactLatticeWeight::One()); + ProtoSupervision proto_sup1, proto_sup2; + SupervisionOptions opts; + opts.frame_subsampling_factor = subsample_factor; + bool ans1 = AlignmentToProtoSupervision(opts, phones_durations, &proto_sup1), + ans2 = PhoneLatticeToProtoSupervision(opts, clat, &proto_sup2); + KALDI_ASSERT(ans1 && ans2); + KALDI_ASSERT(proto_sup1 == proto_sup2); + + Supervision supervision; + if (!ProtoSupervisionToSupervision(*ctx_dep, *trans_model, + proto_sup1, &supervision)) { + // we shouldn't fail because we multiplied by + // 'subsample_factor' when creating the duration. + KALDI_ERR << "Failed creating supervision."; + } + supervision.Check(*trans_model); + TestSupervisionIo(supervision); + TestSupervisionSplitting(*ctx_dep, *trans_model, supervision); + TestSupervisionAppend(*trans_model, supervision); + + { + fst::StdVectorFst den_fst; + ComputeExampleDenFst(*ctx_dep, *trans_model, &den_fst); + DenominatorGraph den_graph(den_fst, trans_model->NumPdfs()); + ChainDenominatorTest(den_graph); + fst::StdVectorFst normalization_fst; + den_graph.GetNormalizationFst(den_fst, &normalization_fst); + // add the weight to the numerator FST so we can assert objf <= 0. + bool ans = AddWeightToSupervisionFst(normalization_fst, &supervision); + KALDI_ASSERT(ans); + // TODO: still have to test for appended sequences. 
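+    // The call below invokes ChainSmbrTrainingTest() (defined earlier in this
+    // file), which performs a finite-difference check of the LF-sMBR derivative,
+    // analogous to what ChainTrainingTest() does for the standard chain objective.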
+ ChainSmbrTrainingTest(den_graph, supervision); + } + + delete ctx_dep; + delete trans_model; +} + void AddArc(int32 from, int32 to, fst::StdVectorFst *fst) { fst->AddArc(from, fst::StdArc(0, 0, fst::TropicalWeight::One(), to)); @@ -605,7 +802,7 @@ void TestRanges() { int main() { using namespace kaldi; - SetVerboseLevel(1); + SetVerboseLevel(2); #if HAVE_CUDA == 1 int32 loop = 0; for (loop = 0; loop < 2; loop++) { @@ -615,8 +812,9 @@ int main() { else CuDevice::Instantiate().SelectGpuId("yes"); #endif - for (int32 i = 0; i < 3; i++) { - kaldi::chain::ChainSupervisionTest(); + for (int32 i = 0; i < 6; i++) { + if (i % 2 == 0) kaldi::chain::ChainSupervisionTest(); + else kaldi::chain::ChainSupervisionSimpleTest(); kaldi::chain::BreadthFirstTest(); } kaldi::chain::TestRanges(); diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 8f95034c437..656d1ada433 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -74,9 +74,14 @@ void ProtoSupervision::Write(std::ostream &os, bool binary) const { void SupervisionOptions::Check() const { KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && frame_subsampling_factor > 0 && - left_tolerance + right_tolerance + 1 >= frame_subsampling_factor); + (left_tolerance + right_tolerance + 1 >= frame_subsampling_factor || (left_tolerance == 0 && right_tolerance == 0))); KALDI_ASSERT(lm_scale >= 0.0 && lm_scale < 1.0); + + if (!silence_phones_str.empty()) { + KALDI_ASSERT(left_tolerance_silence >= 0 && right_tolerance_silence >= 0 && + left_tolerance_silence + right_tolerance_silence + 1 >= frame_subsampling_factor); + } } bool AlignmentToProtoSupervision(const SupervisionOptions &opts, @@ -149,6 +154,16 @@ bool PhoneLatticeToProtoSupervisionInternal( const CompactLattice &lat, ProtoSupervision *proto_supervision) { opts.Check(); + + ConstIntegerSet silence_set; + if (!opts.silence_phones_str.empty()) { + std::vector silence_phones; + if (!SplitStringToIntegers(opts.silence_phones_str, ":,", false, + &silence_phones)) + KALDI_ERR << "Invalid silence-phones string " << opts.silence_phones_str; + silence_set.Init(silence_phones); + } + if (lat.NumStates() == 0) { KALDI_WARN << "Empty lattice provided"; return false; @@ -176,20 +191,29 @@ bool PhoneLatticeToProtoSupervisionInternal( int32 phone = lat_arc.ilabel; // It's an acceptor so ilabel == ollabel. if (phone == 0) { KALDI_WARN << "CompactLattice has epsilon arc. 
Unexpected."; - return false; + continue; } proto_supervision->fst.AddArc(state, fst::StdArc(phone, phone, fst::TropicalWeight( lat_arc.weight.Weight().Value1() - * opts.lm_scale), + * opts.lm_scale + opts.phone_ins_penalty), lat_arc.nextstate)); - int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), + int32 left_tolerance = opts.left_tolerance; + int32 right_tolerance = opts.right_tolerance; + if (!opts.silence_phones_str.empty()) { + if (silence_set.count(phone) > 0) { + left_tolerance = opts.left_tolerance_silence; + right_tolerance = opts.right_tolerance_silence; + } + } + + int32 t_begin = std::max(0, (state_time - left_tolerance)), t_end = std::min(num_frames, - (next_state_time + opts.right_tolerance)), - t_begin_subsampled = (t_begin + factor - 1)/ factor, - t_end_subsampled = (t_end + factor - 1)/ factor; + (next_state_time + right_tolerance)), + t_begin_subsampled = (t_begin + factor - 1)/ factor, + t_end_subsampled = (t_end + factor - 1)/ factor; for (int32 t_subsampled = t_begin_subsampled; t_subsampled < t_end_subsampled; t_subsampled++) proto_supervision->allowed_phones[t_subsampled].push_back(phone); @@ -211,9 +235,11 @@ bool PhoneLatticeToProtoSupervisionInternal( KALDI_ASSERT(!proto_supervision->allowed_phones[t_subsampled].empty()); SortAndUniq(&(proto_supervision->allowed_phones[t_subsampled])); } + return true; } + bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, const CompactLattice &lat, ProtoSupervision *proto_supervision) { @@ -583,6 +609,10 @@ void Supervision::Write(std::ostream &os, bool binary) const { } WriteToken(os, binary, ""); } + if (numerator_post_targets.NumRows() > 0) { + WriteToken(os, binary, ""); + numerator_post_targets.Write(os, binary); + } if (!alignment_pdfs.empty()) { WriteToken(os, binary, ""); WriteIntegerVector(os, binary, alignment_pdfs); @@ -597,6 +627,7 @@ void Supervision::Swap(Supervision *other) { std::swap(label_dim, other->label_dim); std::swap(fst, other->fst); std::swap(e2e_fsts, other->e2e_fsts); + std::swap(numerator_post_targets, other->numerator_post_targets); std::swap(alignment_pdfs, other->alignment_pdfs); } @@ -610,10 +641,21 @@ void Supervision::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &frames_per_sequence); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &label_dim); - bool e2e; - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &e2e); + bool e2e = false; + if (PeekToken(is, binary) == 'E') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &e2e); + } if (!e2e) { + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + numerator_post_targets.Read(is, binary); + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + BaseFloat temp; + ReadBasicType(is, binary, &temp); + } + } if (!binary) { ReadFstKaldi(is, binary, &fst); } else { @@ -643,6 +685,10 @@ void Supervision::Read(std::istream &is, bool binary) { } ExpectToken(is, binary, ""); } + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + numerator_post_targets.Read(is, binary); + } if (PeekToken(is, binary) == 'A') { ExpectToken(is, binary, ""); ReadIntegerVector(is, binary, &alignment_pdfs); @@ -702,7 +748,43 @@ Supervision::Supervision(const Supervision &other): weight(other.weight), num_sequences(other.num_sequences), frames_per_sequence(other.frames_per_sequence), label_dim(other.label_dim), fst(other.fst), - e2e_fsts(other.e2e_fsts), alignment_pdfs(other.alignment_pdfs) { } + e2e_fsts(other.e2e_fsts), alignment_pdfs(other.alignment_pdfs), + 
numerator_post_targets(other.numerator_post_targets) { } + + +// This static function merges the numerator posterior targets in +// input supervision objects and puts it in output supervision. +// This will be called only when the input supervision has +// numerator posterior targets. +void AppendSupervisionPost(const std::vector &input, + Supervision *output_supervision) { + KALDI_ASSERT(!input.empty()); + int32 label_dim = input[0]->label_dim, + num_inputs = input.size(); + KALDI_ASSERT(num_inputs > 1); + KALDI_ASSERT(input[0]->numerator_post_targets.NumRows() > 0); + + KALDI_ASSERT(output_supervision->num_sequences == num_inputs); + + std::vector output_targets(num_inputs); + output_targets[0] = &(input[0]->numerator_post_targets); + + for (int32 i = 1; i < num_inputs; i++) { + output_targets[i] = &(input[i]->numerator_post_targets); + KALDI_ASSERT(output_targets[i]->NumRows() > 0); + KALDI_ASSERT(output_targets[i]->NumCols() == label_dim); + KALDI_ASSERT(input[i]->frames_per_sequence == + output_supervision->frames_per_sequence); + } + + AppendGeneralMatrixRows( + output_targets, &(output_supervision->numerator_post_targets), + true); // sort by t + KALDI_ASSERT(output_supervision->numerator_post_targets.NumRows() + == output_supervision->frames_per_sequence + * output_supervision->num_sequences); + KALDI_ASSERT(output_supervision->frames_per_sequence * output_supervision->num_sequences == output_supervision->numerator_post_targets.NumRows()); +} // This static function is called by MergeSupervision if the supervisions @@ -725,6 +807,15 @@ void MergeSupervisionE2e(const std::vector &input, output_supervision->alignment_pdfs.clear(); // The program nnet3-chain-acc-lda-stats works on un-merged egs, // and there is no need to support merging of 'alignment_pdfs' + + if (input[0]->numerator_post_targets.NumRows() > 0) { + AppendSupervisionPost(input, output_supervision); + KALDI_VLOG(2) << output_supervision->frames_per_sequence << " * " + << output_supervision->num_sequences << " == " + << output_supervision->numerator_post_targets.NumRows(); + + KALDI_ASSERT(output_supervision->frames_per_sequence * output_supervision->num_sequences == output_supervision->numerator_post_targets.NumRows()); + } } void MergeSupervision(const std::vector &input, @@ -732,6 +823,7 @@ void MergeSupervision(const std::vector &input, KALDI_ASSERT(!input.empty()); int32 label_dim = input[0]->label_dim, num_inputs = input.size(); + KALDI_ASSERT(label_dim > 0); if (num_inputs == 1) { *output_supervision = *(input[0]); return; @@ -760,12 +852,21 @@ void MergeSupervision(const std::vector &input, } else { KALDI_ERR << "Mismatch weight or frames_per_sequence between inputs"; } - } + fst::StdVectorFst &out_fst = output_supervision->fst; // The process of concatenation will have introduced epsilons. 
fst::RmEpsilon(&out_fst); SortBreadthFirstSearch(&out_fst); + + if (input[0]->numerator_post_targets.NumRows() > 0) { + AppendSupervisionPost(input, output_supervision); + KALDI_VLOG(2) << output_supervision->frames_per_sequence << " * " + << output_supervision->num_sequences << " == " + << output_supervision->numerator_post_targets.NumRows(); + + KALDI_ASSERT(output_supervision->frames_per_sequence * output_supervision->num_sequences == output_supervision->numerator_post_targets.NumRows()); + } } // This static function is called by AddWeightToSupervisionFst if the supervision @@ -797,14 +898,11 @@ bool AddWeightToSupervisionFstE2e(const fst::StdVectorFst &normalization_fst, return true; } -bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, - Supervision *supervision) { - if (!supervision->e2e_fsts.empty()) - return AddWeightToSupervisionFstE2e(normalization_fst, supervision); - +bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, + fst::StdVectorFst *supervision_fst) { // remove epsilons before composing. 'normalization_fst' has noepsilons so // the composed result will be epsilon free. - fst::StdVectorFst supervision_fst_noeps(supervision->fst); + fst::StdVectorFst supervision_fst_noeps(*supervision_fst); fst::RmEpsilon(&supervision_fst_noeps); if (!TryDeterminizeMinimize(kSupervisionMaxStates, &supervision_fst_noeps)) { @@ -817,8 +915,10 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, fst::StdVectorFst composed_fst; fst::Compose(supervision_fst_noeps, normalization_fst, &composed_fst); - if (composed_fst.NumStates() == 0) + if (composed_fst.NumStates() == 0) { + KALDI_WARN << "FST empty after composing with normalization FST."; return false; + } // projection should not be necessary, as both FSTs are acceptors. // determinize and minimize to make it as compact as possible. @@ -827,15 +927,23 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, KALDI_WARN << "Failed to determinize normalized supervision fst"; return false; } - supervision->fst = composed_fst; - + *supervision_fst = composed_fst; // Make sure the states are numbered in increasing order of time. 
- SortBreadthFirstSearch(&(supervision->fst)); - KALDI_ASSERT(supervision->fst.Properties(fst::kAcceptor, true) == fst::kAcceptor); - KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + SortBreadthFirstSearch(supervision_fst); + KALDI_ASSERT(supervision_fst->Properties(fst::kAcceptor, true) == fst::kAcceptor); + KALDI_ASSERT(supervision_fst->Properties(fst::kIEpsilons, true) == 0); return true; } + +bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, + Supervision *supervision) { + if (!supervision->e2e_fsts.empty()) + return AddWeightToSupervisionFstE2e(normalization_fst, supervision); + return AddWeightToFst(normalization_fst, &(supervision->fst)); +} + + void SplitIntoRanges(int32 num_frames, int32 frames_per_range, std::vector *range_starts) { diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index f1a796dc2f8..83c5f782cdb 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -30,6 +30,7 @@ #include "lat/kaldi-lattice.h" #include "fstext/deterministic-fst.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" namespace kaldi { namespace chain { @@ -53,13 +54,20 @@ struct SupervisionOptions { BaseFloat weight; BaseFloat lm_scale; bool convert_to_pdfs; + BaseFloat phone_ins_penalty; + int32 left_tolerance_silence; + int32 right_tolerance_silence; + std::string silence_phones_str; SupervisionOptions(): left_tolerance(5), right_tolerance(5), frame_subsampling_factor(1), weight(1.0), lm_scale(0.0), - convert_to_pdfs(true) { } + convert_to_pdfs(true), + phone_ins_penalty(0.0), + left_tolerance_silence(0), + right_tolerance_silence(0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -80,11 +88,23 @@ struct SupervisionOptions { "supervision fst."); opts->Register("convert-to-pdfs", &convert_to_pdfs, "If true, convert " "transition-ids to pdf-ids + 1 in supervision FSTs."); + opts->Register("phone-ins-penalty", &phone_ins_penalty, + "The penalty to penalize longer paths"); + opts->Register("left-tolerance-silence", &left_tolerance_silence, "Left tolerance for " + "shift in silence phone position relative to the alignment"); + opts->Register("right-tolerance-silence", &right_tolerance_silence, "Right tolerance for " + "shift in silence phone position relative to the alignment"); + opts->Register("silence-phones", &silence_phones_str, + "A comma separated list of silence phones"); } void Check() const; }; +bool TryDeterminizeMinimize(int32 supervision_max_states, + fst::StdVectorFst *supervision_fst); + + // This is the form that the supervision information for 'chain' models takes // we compile it to Supervision. // The normal compilation sequence is: @@ -275,7 +295,6 @@ struct Supervision { // chunk. [Code location TBD]. std::vector e2e_fsts; - // This member is only set to a nonempty value if we are creating 'unconstrained' // egs. These are egs that are split into chunks using the lattice alignments, // but then within the chunks we remove the frame-level constraints on which @@ -286,9 +305,13 @@ struct Supervision { // it will only be present for un-merged egs. 
std::vector alignment_pdfs; + GeneralMatrix numerator_post_targets; + Supervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1), label_dim(-1) { } + Supervision(int32 dim, const Posterior &labels); + Supervision(const Supervision &other); void Swap(Supervision *other); @@ -400,6 +423,9 @@ class SupervisionSplitter { /// This function also removes epsilons and makes sure supervision->fst has the /// required sorting of states. Think of it as the final stage in preparation /// of the supervision FST. +bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, + fst::StdVectorFst *supervision_fst); + bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, Supervision *supervision); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 6b4a7b593c2..0a3e011120b 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -23,12 +23,13 @@ #include "chain/chain-numerator.h" #include "chain/chain-generic-numerator.h" #include "chain/chain-denominator.h" +#include "chain/chain-denominator-smbr.h" +#include "hmm/posterior.h" namespace kaldi { namespace chain { - void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, @@ -137,6 +138,101 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, } } +void ComputeChainDenominatorObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const CuMatrixBase &nnet_output, + BaseFloat supervision_weight, int32 num_sequences, + BaseFloat *objf, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { + CuMatrix deriv; + + if (nnet_output_deriv) + KALDI_ASSERT(nnet_output.NumRows() == nnet_output_deriv->NumRows() + && nnet_output.NumCols() == nnet_output_deriv->NumCols()); + + if (xent_output_deriv) { + KALDI_ASSERT(nnet_output.NumRows() == xent_output_deriv->NumRows() + && nnet_output.NumCols() == xent_output_deriv->NumCols()); + } + + if (xent_output_deriv != NULL || nnet_output_deriv != NULL) + deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols()); + + BaseFloat logprob_weighted; + bool ok = true; + + { + DenominatorComputation denominator(opts, den_graph, + num_sequences, + nnet_output); + + logprob_weighted = supervision_weight * denominator.Forward(); + if (nnet_output_deriv) + ok = denominator.Backward(supervision_weight * opts.kl_factor, + &deriv); + } + + int32 frames_per_sequence = nnet_output.NumRows() / num_sequences; + + *objf = logprob_weighted; + *weight = supervision_weight * num_sequences * frames_per_sequence; + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. + if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = -10; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; + } else { + if (xent_output_deriv) { + // the reason for kStrideEqualNumCols is so that we can share the memory + // block with the memory that was used for exp_nnet_output_transposed_ from + // chain-denominator.cc, which has just been freed; it also uses the + // kStrideEqualNumCols arg (its shape is the transpose of this matrix's + // shape). 
+ xent_output_deriv->AddMat(1.0, deriv); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(1.0, deriv); + } else if (nnet_output_deriv) { + nnet_output_deriv->AddMat(1.0, deriv); + } + } + + // This code helps us see how big the derivatives are, on average, + // for different frames of the sequences. As expected, they are + // smaller towards the edges of the sequences (due to the penalization + // of 'incorrect' pdf-ids. + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL && RandInt(0, 10) == 0) { + int32 tot_frames = nnet_output_deriv->NumRows(); + CuVector row_products(tot_frames); + row_products.AddDiagMat2(1.0, deriv, kNoTrans, 0.0); + Vector row_products_cpu(row_products); + Vector row_products_per_frame(frames_per_sequence); + for (int32 i = 0; i < tot_frames; i++) + row_products_per_frame(i / num_sequences) += row_products_cpu(i); + KALDI_LOG << "Derivs per frame are " << row_products_per_frame; + } +} + +void ComputeChainNumeratorPost(const Supervision &supervision, + const CuMatrixBase &nnet_output, + CuMatrixBase *numerator_post) { + KALDI_ASSERT(supervision.weight == 1.0); + KALDI_ASSERT(numerator_post->NumRows() == nnet_output.NumRows() && + numerator_post->NumCols() == nnet_output.NumCols()); + NumeratorComputation numerator(supervision, nnet_output); + numerator.Forward(); + numerator_post->SetZero(); + numerator.Backward(1.0, numerator_post); +} void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -166,9 +262,10 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, supervision.num_sequences, nnet_output); - den_logprob_weighted = supervision.weight * denominator.Forward(); + den_logprob_weighted = supervision.weight * + (opts.mmi_factor + opts.kl_factor) * denominator.Forward(); if (nnet_output_deriv) - ok = denominator.Backward(-supervision.weight, + ok = denominator.Backward(-supervision.weight * (opts.mmi_factor + opts.kl_factor), nnet_output_deriv); } @@ -182,22 +279,36 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, kSetZero, kStrideEqualNumCols); } - - { + if (opts.mmi_factor > 0.0) { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from // the numerator object, as well as the returned logprob. 
- num_logprob_weighted = numerator.Forward(); + num_logprob_weighted = opts.mmi_factor * numerator.Forward(); if (xent_output_deriv) { - numerator.Backward(xent_output_deriv); + numerator.Backward(opts.mmi_factor, xent_output_deriv); if (nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); + numerator.Backward(opts.mmi_factor, nnet_output_deriv); } } + if (opts.kl_factor > 0.0) { + CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); + supervision.numerator_post_targets.CopyToMat(&numerator_post); + if (xent_output_deriv) { + xent_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); + } else if (nnet_output_deriv) { + nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); + } + + num_logprob_weighted += supervision.weight * opts.kl_factor * + TraceMatMat(nnet_output, numerator_post, kTrans); + } + *objf = num_logprob_weighted - den_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; @@ -235,12 +346,194 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, if (opts.l2_regularize == 0.0) { *l2_term = 0.0; + } else if (!opts.norm_regularize) { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + CuMatrix exp_nnet_output(nnet_output); + exp_nnet_output.ApplyExp(); + *l2_term = -scale * exp_nnet_output.Sum(); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, exp_nnet_output); + } +} + +void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *mmi_objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrix *xent_output_deriv, + const CuArray *sil_indices) { + // numerator_post is a matrix of size + // (num_sequences * frames_per_sequence) x num_pdfs and is ordered in the + // same way as nnet_output is i.e. + // first the first frame of each sequence, then the second frame of + // each sequence, and so on. + CuMatrix numerator_post(nnet_output.NumRows(), + nnet_output.NumCols()); + + BaseFloat num_logprob_weighted; + { + NumeratorComputation numerator(supervision, nnet_output); + // note: supervision.weight is included as a factor in the derivative from + // the numerator object, and the logprob too. 
+ num_logprob_weighted = (opts.mmi_factor + opts.ml_factor) * numerator.Forward(); + numerator.Backward(1.0, &numerator_post); +#if HAVE_CUDA == 1 + if (!CuDevice::Instantiate().Enabled()) +#endif + { // Debugging + if (GetVerboseLevel() >= 2) { + Posterior post(numerator_post.NumRows()); + for (int32 i = 0; i < numerator_post.NumRows(); i++) { + CuSubVector row(numerator_post, i); + for (int32 j = 0; j < row.Dim(); j++) { + BaseFloat p = row(j); + if (p >= 0.01) { + post[i].push_back(std::make_pair(j, p)); + } + } + } + PosteriorHolder::Write(KALDI_LOG, false, post); + } + } + + if (nnet_output_deriv && (opts.mmi_factor != 0.0 || opts.ml_factor != 0.0)) { + nnet_output_deriv->CopyFromMat(numerator_post); + nnet_output_deriv->Scale(opts.mmi_factor + opts.ml_factor); + } + + if (xent_output_deriv) { + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + xent_output_deriv->CopyFromMat(numerator_post); + } + } + + if (opts.smbr_use_numerator_post_targets && + supervision.numerator_post_targets.NumRows() > 0) { + supervision.numerator_post_targets.CopyToMat(&numerator_post); + } + + if (opts.smbr_threshold > 0) { + KALDI_ASSERT(opts.smbr_threshold > 1.0 / nnet_output.NumCols()); + + // Consider all posteriors below smbr_threshold to be 0. + CuMatrix tmp(numerator_post); + tmp.Add(-opts.smbr_threshold); + tmp.ApplyHeaviside(); + numerator_post.MulElements(tmp); + + CuVector normalizer(nnet_output.NumRows()); + normalizer.AddColSumMat(1.0, numerator_post); + normalizer.Add(1e-8); + numerator_post.DivRowsVec(normalizer); + } + + if (sil_indices && opts.exclude_silence) { + // Exclude numerator posteriors for silence pdfs from accuracy + // computation. This is done by setting silence pdf posteriors to zero. + // sil_indices is expected to have -1 at the indexes corresponding to + // silence pdfs, and "i" for any other index "i". + numerator_post.CopyCols(numerator_post, *sil_indices); + } else if (sil_indices && opts.one_silence_class) { + // Create a copy with only the silence pdf posteriors. + CuMatrix silence_post(nnet_output.NumRows(), + nnet_output.NumCols()); + silence_post.CopyCols(numerator_post, *sil_indices); + + // Sum the posteriors of silence pdfs to get posterior of silence class. + CuVector total_silence_post(nnet_output.NumRows()); + total_silence_post.AddColSumMat(1.0, silence_post, 0.0); + + // Copy the silence class posterior to the columns of the silence pdfs. 
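+      // i.e., for the purposes of the accuracy computation, each silence pdf is
+      // credited with the total posterior of the whole silence class rather than
+      // its own posterior.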
+ numerator_post.CopyColsFromVec(total_silence_post, *sil_indices); + } + + DenominatorSmbrComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output, numerator_post); + + BaseFloat den_logprob_negated; + BaseFloat smbr_objf = denominator.ForwardSmbr(&den_logprob_negated); + + //if (opts.mmi_factor != 0.0) { + // DenominatorComputation denominator_mmi(opts, den_graph, + // supervision.num_sequences, + // nnet_output); + // KALDI_ASSERT(kaldi::ApproxEqual(-den_logprob_negated, opts.mmi_factor * denominator_mmi.Forward())); + //} + + bool ok = true; + if (nnet_output_deriv) { + if (opts.mmi_factor == 0.0 && opts.ml_factor == 0.0) nnet_output_deriv->SetZero(); + ok = denominator.BackwardSmbr(supervision.weight, nnet_output_deriv); + } + + *objf = supervision.weight * smbr_objf; + *mmi_objf = supervision.weight * den_logprob_negated + num_logprob_weighted; + *weight = supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence; + + BaseFloat total_objf = *objf + *mmi_objf; + if (!((total_objf) - (total_objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. + if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = -(opts.mmi_factor + opts.ml_factor) * 10; + KALDI_WARN << "Objective function is " << (total_objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *mmi_objf = default_objf * *weight; + *objf = 0.0; + } + + // This code helps us see how big the derivatives are, on average, + // for different frames of the sequences. As expected, they are + // smaller towards the edges of the sequences (due to the penalization + // of 'incorrect' pdf-ids. + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL) { + int32 tot_frames = nnet_output_deriv->NumRows(), + frames_per_sequence = supervision.frames_per_sequence, + num_sequences = supervision.num_sequences; + CuVector row_products(tot_frames); + row_products.AddDiagMat2(1.0, *nnet_output_deriv, kNoTrans, 0.0); + Vector row_products_cpu(row_products); + Vector row_products_per_frame(frames_per_sequence); + for (int32 i = 0; i < tot_frames; i++) + row_products_per_frame(i / num_sequences) += row_products_cpu(i); + KALDI_LOG << "Derivs per frame are " << row_products_per_frame; + } + + if (opts.l2_regularize == 0.0) { + *l2_term = 0.0; + } else if (!opts.norm_regularize) { // compute the l2 penalty term and its derivative BaseFloat scale = supervision.weight * opts.l2_regularize; *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); if (nnet_output_deriv) nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + CuMatrix exp_nnet_output(nnet_output); + exp_nnet_output.ApplyExp(); + *l2_term = -scale * exp_nnet_output.Sum(); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, exp_nnet_output); } } diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 6ea70b5ca41..83a49bdfa09 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -56,19 +56,48 @@ struct ChainTrainingOptions { // Note: we generally set leaky_hmm_coefficient to 0.1. BaseFloat leaky_hmm_coefficient; - // Cross-entropy regularization constant. (e.g. try 0.1). 
If nonzero, // the network is expected to have an output named 'output-xent', which // should have a softmax as its final nonlinearity. BaseFloat xent_regularize; + bool use_smbr_objective; + bool exclude_silence; + bool one_silence_class; + + std::string silence_pdfs_str; + + BaseFloat mmi_factor; + BaseFloat ml_factor; + BaseFloat kl_factor; + BaseFloat smbr_factor; + BaseFloat smbr_threshold; + + bool self_kl; + bool norm_regularize; + + BaseFloat smbr_leaky_hmm_coefficient; + bool smbr_use_numerator_post_targets; + + std::string smbr_factors_str, mmi_factors_str, ml_factors_str, kl_factors_str; + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), - xent_regularize(0.0) { } + xent_regularize(0.0), use_smbr_objective(false), + exclude_silence(false), one_silence_class(false), + mmi_factor(1.0), ml_factor(0.0), kl_factor(0.0), + smbr_factor(0.0), smbr_threshold(0.0), self_kl(false), + norm_regularize(false), + smbr_leaky_hmm_coefficient(-1), + smbr_use_numerator_post_targets(false) { } void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " "of the neural net."); + opts->Register("norm-regularize", &norm_regularize, + "If true, then use l1 regularization on exponential of the " + "output of the neural net. Tends to make the " + "exp(output) small and more like probabilities."); opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " "that allows transitions from each HMM state to each other " "HMM state, to ensure gradual forgetting of context (can " @@ -79,6 +108,47 @@ struct ChainTrainingOptions { "nonzero, the network is expected to have an output " "named 'output-xent', which should have a softmax as " "its final nonlinearity."); + opts->Register("use-smbr-objective", &use_smbr_objective, + "Use SMBR objective instead of MMI"); + opts->Register("silence-pdfs", &silence_pdfs_str, + "A comma-separated list of silence pdfs. " + "It makes sense only when the silence pdfs are " + "context-independent."); + opts->Register("mmi-factor", &mmi_factor, + "When using smbr objective, interpolate mmi objective " + "with this weight"); + opts->Register("ml-factor", &ml_factor, + "When using smbr objective, interpolate ml objective " + "with this weight"); + opts->Register("smbr-factor", &smbr_factor, + "When using smbr objective, interpolate smbr objective " + "with this weight"); + opts->Register("exclude-silence", &exclude_silence, + "Exclude numerator posteriors " + "of silence pdfs from accuracy computation in " + "sMBR training. --silence-pdfs is required if " + "this option is true."); + opts->Register("one-silence-class", &one_silence_class, + "Treat all silence pdfs as a single class for accuracy " + "computation in smBR training. --silence-pdfs is required " + "if this options is true."); + opts->Register("smbr-threshold", &smbr_threshold, + "Posterior below this value is considered 0"); + opts->Register("smbr-factors", &smbr_factors_str, + "SMBR factors for each output"); + opts->Register("mmi-factors", &mmi_factors_str, + "MMI factors for each output"); + opts->Register("ml-factors", &ml_factors_str, + "ML factors for each output"); + opts->Register("kl-factors", &kl_factors_str, + "KL factors for each output"); + opts->Register("smbr-leaky-hmm-coefficient", &smbr_leaky_hmm_coefficient, + "leaky-hmm-coefficient for LF-sMBR training. 
If not " + "provided, will use --leaky-hmm-coefficient instead."); + opts->Register("smbr-use-numerator-post-targets", + &smbr_use_numerator_post_targets, + "Use numerator posterior targets for computing " + "SMBR per-frame accuracies."); } }; @@ -126,7 +196,64 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, CuMatrixBase *nnet_output_deriv, CuMatrix *xent_output_deriv = NULL); +void ComputeChainNumeratorPost(const Supervision &supervision, + const CuMatrixBase &nnet_output, + CuMatrixBase *numerator_post); + +/** + This function does both the numerator and denominator parts of the 'chain' + smbr computation in one call. + + @param [in] opts Struct containing options + @param [in] den_graph The denominator graph, derived from denominator fst. + @param [in] supervision The supervision object, containing the supervision + paths and constraints on the alignment as an FST + @param [in] nnet_output The output of the neural net; dimension must equal + ((supervision.num_sequences * supervision.frames_per_sequence) by + den_graph.NumPdfs()). The rows are ordered as: all sequences + for frame 0; all sequences for frame 1; etc. + @param [out] objf The smbr objective function computed for this + example; you'll want to divide it by 'tot_weight' before + displaying it. + @param [out] l2_term The l2 regularization term in the objective function, if + the --l2-regularize option is used. To be added to 'o + @param [out] weight The weight to normalize the objective function by; + equals supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence. + @param [out] nnet_output_deriv The derivative of the objective function w.r.t. + the neural-net output. Only written to if non-NULL. + You don't have to zero this before passing to this function, + we zero it internally. + @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative + (which equals a posterior from the numerator forward-backward, + scaled by the supervision weight) is written to here. This will + be used in the cross-entropy regularization code. This value + is also used in computing the cross-entropy objective value. +*/ +void ComputeChainSmbrObjfAndDeriv( + const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, BaseFloat *mmi_objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrix *xent_output_deriv = NULL, + const CuArray *sil_indices = NULL); +/** + This function uses supervision as numerator and does denominator computation. + It can be uses, where numerator is fixed e.g. TS learning. 
+*/ +void ComputeChainDenominatorObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const CuMatrixBase &nnet_output, + BaseFloat supervision_weight, int32 num_sequences, + BaseFloat *objf, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); } // namespace chain } // namespace kaldi diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc index 41e06116ea8..c8900726b00 100644 --- a/src/chain/language-model.cc +++ b/src/chain/language-model.cc @@ -52,7 +52,7 @@ void LanguageModelEstimator::IncrementCount(const std::vector &history, if (lm_states_[lm_state_index].tot_count == 0) { num_active_lm_states_++; } - lm_states_[lm_state_index].AddCount(next_phone, 1); + lm_states_[lm_state_index].AddCount(next_phone, 1.0); } void LanguageModelEstimator::SetParentCounts() { diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 61f653f174f..9fbac324a59 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -11,8 +11,11 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs \ - nnet3-chain-e2e-get-egs nnet3-chain-compute-post - + nnet3-chain-e2e-get-egs nnet3-chain-compute-post \ + nnet3-chain-split-and-get-egs chain-split-lattices \ + nnet3-chain-split-convert-and-get-egs \ + chain-lattice-to-post chain-fst-to-post \ + nnet3-chain-compute-numerator-post OBJFILES = diff --git a/src/chainbin/chain-lattice-to-post.cc b/src/chainbin/chain-lattice-to-post.cc new file mode 100644 index 00000000000..d07dd8fef1f --- /dev/null +++ b/src/chainbin/chain-lattice-to-post.cc @@ -0,0 +1,130 @@ +// chainbin/chain-lattice-to-post.cc + +// Copyright 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "lat/lattice-functions.h" +#include "chain/chain-supervision.h" +#include "chain/chain-supervision-splitter.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Do forward-backward and collect pdf posteriors over lattices.\n" + "The labels are converted to a 1-index i.e. 
pdf-id + 1\n" + "An FST with labels as the 1-indexed pdf-ids can be optionally " + "provided to interpolate with the LM scores from lattice.\n" + "Usage: chain-lattice-to-post [options] [] " + "\n" + "\n"; + + BaseFloat acoustic_scale = 1.0, fst_scale = 0.0; + + ParseOptions po(usage); + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("fst-scale", &fst_scale, + "Scaling factor for the that will interpolated " + "with the lattice." + "Effectively this is (1-fst_scale) * lattice-graph-cost + fst_scale * fst-costs"); + po.Read(argc, argv); + + if (po.NumArgs() != 3 && po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + + std::string trans_model_rxfilename, + lattice_rspecifier, + fst_rxfilename, + post_wspecifier; + + if (po.NumArgs() == 3) { + trans_model_rxfilename = po.GetArg(1); + lattice_rspecifier = po.GetArg(2); + post_wspecifier = po.GetArg(3); + } else { + fst_rxfilename = po.GetArg(1); + trans_model_rxfilename = po.GetArg(2); + lattice_rspecifier = po.GetArg(3); + post_wspecifier = po.GetArg(4); + } + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + fst::StdVectorFst fst; + if (!fst_rxfilename.empty()) { + ReadFstKaldi(fst_rxfilename, &fst); + KALDI_ASSERT(fst.NumStates() > 0); + + if (fst_scale < 0.0 || fst_scale > 1.0) { + KALDI_ERR << "Invalid fst-scale; must be in [0.0, 1.0)"; + } + + if (fst_scale != 1.0) { + fst::ApplyProbabilityScale(fst_scale, &fst); + } + } + + fst::RmEpsilon(&fst); + fst::ArcSort(&fst, fst::ILabelCompare()); + + SequentialLatticeReader lattice_reader(lattice_rspecifier); + PosteriorWriter posterior_writer(post_wspecifier); + + int32 num_done = 0, num_fail = 0; + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + + Lattice lat = lattice_reader.Value(); + + fst::ScaleLattice(fst::LatticeScale(1.0 - fst_scale, acoustic_scale), &lat); + + Posterior graph_post; + bool status = LatticeToNumeratorPost(lat, trans_model, fst, + &graph_post, key); + if (!status) { + num_fail++; + continue; + } + + posterior_writer.Write(key, graph_post); + num_done++; + } + + KALDI_LOG << "Converted " << num_done << " lattices to posteriors; " + << "failed for " << num_fail; + + return num_done > 0 ? 0 : 1; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/chainbin/chain-split-lattices.cc b/src/chainbin/chain-split-lattices.cc new file mode 100644 index 00000000000..32fc54345a7 --- /dev/null +++ b/src/chainbin/chain-split-lattices.cc @@ -0,0 +1,175 @@ +// chainbin/chain-split-lattices.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "chain/chain-supervision-splitter.h" +#include "lat/lattice-functions.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" +#include "fstext/kaldi-fst-io.h" + +namespace kaldi { +namespace nnet3 { + + +/** + This function does all the processing for one utterance, and outputs the + supervision objects to 'example_writer'. Note: if normalization_fst is the + empty FST (with no states), it skips the final stage of egs preparation and + you should do it later with nnet3-chain-normalize-egs. +*/ + +static bool ProcessFile(const chain::SupervisionLatticeSplitter &sup_lat_splitter, + const std::string &utt_id, + UtteranceSplitter *utt_splitter, + TableWriter *fst_writer, + LatticeWriter *lat_writer) { + std::vector state_times; + + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + int32 num_frames = sup_lat_splitter.NumFrames() * frame_subsampling_factor; + + std::vector chunks; + + utt_splitter->GetChunksForUtterance(num_frames, &chunks); + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_frames << " frames."; + return false; + } + + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; + + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + Lattice lat_part; + chain::Supervision supervision_part; + sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, + num_frames_subsampled, + &supervision_part, + &lat_part); + + std::ostringstream oss; + oss << utt_id << "-" << start_frame_subsampled << "-" << num_frames_subsampled; + std::string key = oss.str(); + + fst_writer->Write(key, supervision_part.fst); + lat_writer->Write(key, lat_part); + } + return true; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Split lattices to chain supervision FSTs\n" + "\n" + "Usage: chain-split-lattices [options] " + " []\n"; + + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. 
+ chain::SupervisionOptions sup_opts; + + int32 srand_seed = 0; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + + eg_config.Register(&po); + + ParseOptions supervision_opts("supervision", &po); + sup_opts.Register(&supervision_opts); + + chain::SupervisionLatticeSplitterOptions sup_lat_splitter_opts; + sup_lat_splitter_opts.Register(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() != 3 && po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + + std::string + trans_model_rxfilename, + lattice_rspecifier, fst_wspecifier; + trans_model_rxfilename = po.GetArg(1); + lattice_rspecifier = po.GetArg(2); + fst_wspecifier = po.GetArg(3); + + std::string lattice_wspecifier = po.GetOptArg(4); + + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + SequentialLatticeReader lattice_reader(lattice_rspecifier); + TableWriter fst_writer(fst_wspecifier); + LatticeWriter lattice_writer(lattice_wspecifier); + + int32 num_err = 0; + + fst::StdVectorFst den_fst; + chain::SupervisionLatticeSplitter sup_lat_splitter( + sup_lat_splitter_opts, sup_opts, trans_model, den_fst); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + const Lattice &lat = lattice_reader.Value(); + + sup_lat_splitter.LoadLattice(lat); + if (!ProcessFile(sup_lat_splitter, + key, &utt_splitter, &fst_writer, + &lattice_writer)) + num_err++; + } + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index 693eb2dad86..7f118c0ec66 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -92,20 +92,31 @@ class NnetChainLdaStatsAccumulator { const fst::StdVectorFst &fst = supervision.fst; - Lattice lat; - // convert the FST to a lattice, putting all the weight on - // the graph weight. This is to save us having to implement the - // forward-backward on FSTs. - ConvertFstToLattice(fst, &lat); Posterior post; - LatticeForwardBackward(lat, &post); - KALDI_ASSERT(post.size() == static_cast(num_frames)); - - // Subtract one, to convert the (pdf-id + 1) which appears in the - // supervision FST, to a pdf-id. - for (size_t i = 0; i < post.size(); i++) - for (size_t j = 0; j < post[i].size(); j++) - post[i][j].first--; + if (supervision.numerator_post_targets.NumRows() > 0) { + const SparseMatrix &labels = supervision.numerator_post_targets.GetSparseMatrix(); + post.resize(labels.NumRows()); + for (size_t i = 0; i < labels.NumRows(); i++) { + post[i].resize(labels.Row(i).NumElements()); + for (size_t j = 0; j < labels.Row(i).NumElements(); j++) { + post[i][j] = labels.Row(i).GetElement(j); + } + } + } else { + Lattice lat; + // convert the FST to a lattice, putting all the weight on + // the graph weight. This is to save us having to implement the + // forward-backward on FSTs. + ConvertFstToLattice(fst, &lat); + LatticeForwardBackward(lat, &post); + KALDI_ASSERT(post.size() == static_cast(num_frames)); + + // Subtract one, to convert the (pdf-id + 1) which appears in the + // supervision FST, to a pdf-id. 
+ for (size_t i = 0; i < post.size(); i++) + for (size_t j = 0; j < post[i].size(); j++) + post[i][j].first--; + } if (lda_stats_.Dim() == 0) lda_stats_.Init(num_pdfs, diff --git a/src/chainbin/nnet3-chain-compute-numerator-post.cc b/src/chainbin/nnet3-chain-compute-numerator-post.cc new file mode 100644 index 00000000000..3d1b0fb4a00 --- /dev/null +++ b/src/chainbin/nnet3-chain-compute-numerator-post.cc @@ -0,0 +1,161 @@ +// nnet3bin/nnet3-chain-compute-numerator-post.cc + +// Copyright 2018 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-diagnostics.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +void ProcessOutputs(const Nnet &nnet, + const NnetChainExample &eg, NnetComputer *computer, + NnetChainExample *eg_out) { + *eg_out = eg; + + // There will normally be just one output here, named 'output', + // but the code is more general than this. + std::vector::const_iterator iter = eg.outputs.begin(), + end = eg.outputs.end(); + std::vector::iterator out_iter = eg_out->outputs.begin(), + out_end = eg_out->outputs.end(); + for (; iter != end; ++iter, ++out_iter) { + const NnetChainSupervision &sup = *iter; + int32 node_index = nnet.GetNodeIndex(sup.name); + if (node_index < 0 || + !nnet.IsOutputNode(node_index)) + KALDI_ERR << "Network has no output named " << sup.name; + + const CuMatrixBase &nnet_output = computer->GetOutput(sup.name); + + CuMatrix numerator_post( + nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); + chain::ComputeChainNumeratorPost(sup.supervision, + nnet_output, &numerator_post); + + out_iter->supervision.numerator_post_targets = + SparseMatrix(Matrix(numerator_post)); + } +} + +void ComputeNumeratorPost(const NnetComputeProbOptions &nnet_config, + const Nnet &nnet, + CachingOptimizingCompiler *compiler, + const NnetChainExample &eg, + NnetChainExample *eg_out) { + bool need_model_derivative = false, store_component_stats = false, + use_xent_regularization = false, use_xent_derivative = false; + + ComputationRequest request; + GetChainComputationRequest(nnet, eg, need_model_derivative, + store_component_stats, use_xent_regularization, + use_xent_derivative, &request); + + std::shared_ptr computation = compiler->Compile(request); + NnetComputer computer(nnet_config.compute_config, *computation, + nnet, NULL); + // give the inputs to the computer object. 
+ computer.AcceptInputs(nnet, eg.inputs); + computer.Run(); + ProcessOutputs(nnet, eg, &computer, eg_out); +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Computes the numerator posteriors per frame of the given data with \n" + "an nnet3+chain neural net and outputs egs that include those \n" + "numerator posteriors. The input of this is the output of\n" + "e.g. nnet3-chain-get-egs |\n" + "\n" + "Usage: nnet3-chain-compute-numerator-post [options] \n" + "e.g.: nnet3-chain-compute-numerator-post 0.mdl ark:cegs.1.ark ark:cegs_out.1.ark\n"; + + bool batchnorm_test_mode = true, dropout_test_mode = true; + + // This program doesn't support using a GPU, because these probabilities are + // used for diagnostics, and you can just compute them with a small enough + // amount of data that a CPU can do it within reasonable time. + // It wouldn't be hard to make it support GPU, though. + + NnetComputeProbOptions nnet_opts; + + ParseOptions po(usage); + + po.Register("batchnorm-test-mode", &batchnorm_test_mode, + "If true, set test-mode to true on any BatchNormComponents."); + po.Register("dropout-test-mode", &dropout_test_mode, + "If true, set test-mode to true on any DropoutComponents and " + "DropoutMaskComponents."); + + nnet_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + nnet_opts.compute_deriv = false; + + std::string nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + examples_wspecifier = po.GetArg(3); + + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + if (batchnorm_test_mode) + SetBatchnormTestMode(true, &nnet); + + if (dropout_test_mode) + SetDropoutTestMode(true, &nnet); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + + CachingOptimizingCompiler compiler(nnet, nnet_opts.optimize_config, + nnet_opts.compiler_config); + + int32 num_done = 0; + for (; !example_reader.Done(); example_reader.Next()) { + NnetChainExample eg_out; + ComputeNumeratorPost(nnet_opts, nnet, &compiler, + example_reader.Value(), &eg_out); + example_writer.Write(example_reader.Key(), eg_out); + num_done++; + } + + return (num_done > 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc index 8cf25d4ad08..01cce6d4165 100644 --- a/src/chainbin/nnet3-chain-compute-prob.cc +++ b/src/chainbin/nnet3-chain-compute-prob.cc @@ -82,8 +82,11 @@ int main(int argc, char *argv[]) { fst::StdVectorFst den_fst; ReadFstKaldi(den_fst_rxfilename, &den_fst); + if (GetVerboseLevel() > 2) + nnet_opts.compute_deriv = true; + NnetChainComputeProb chain_prob_computer(nnet_opts, chain_opts, den_fst, - nnet); + nnet); SequentialNnetChainExampleReader example_reader(examples_rspecifier); diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc new file mode 100644 index 00000000000..1cfe7d1cf6a --- /dev/null +++ b/src/chainbin/nnet3-chain-get-egs-post.cc @@ -0,0 +1,431 @@ +// chainbin/nnet3-chain-get-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" +#include "lat/lattice-functions.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace nnet3 { + +/** This function converts lattice to FSA with weight equal to + sum of acoustic and language score, and pdf_id + 1 as labels. + This assumes that the acoustic and language scores are scaled appropriately. +*/ +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. 
+ StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) + ofst->AddState(); + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + const ArcIn &arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + if (arc.ilabel == 0) + oarc.ilabel = 0; // epsilon arc + else + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 + oarc.olabel = oarc.ilabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + + +/** + This function does all the processing for one utterance, and outputs the + supervision objects to 'example_writer'. Note: if normalization_fst is the + empty FST (with no states), it skips the final stage of egs preparation and + you should do it later with nnet3-chain-normalize-egs. +*/ + +static bool ProcessFile(const GeneralMatrix &feats, + const MatrixBase *ivector_feats, + int32 ivector_period, + const Posterior &pdf_post, + BaseFloat min_post, + const VectorBase *deriv_weights, + int32 supervision_length_tolerance, + const std::string &utt_id, + bool compress, + int32 num_pdfs, + UtteranceSplitter *utt_splitter, + NnetChainExampleWriter *example_writer) { + //KALDI_ASSERT(supervision.num_sequences == 1); + int32 num_input_frames = feats.NumRows(); + int32 num_output_frames = pdf_post.size(); + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) + return false; // LengthsMatch() will have printed a warning. + + std::vector chunks; + + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; + return false; + } + + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; + + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context, + start_frame = chunk.first_frame - chunk.left_context; + + GeneralMatrix input_frames; + ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames, + &input_frames); + + NnetChainExample eg; + // call the regular input "input". + eg.inputs.push_back(NnetIo("input", -chunk.left_context, input_frames)); + + if (ivector_feats != NULL) { + // if applicable, add the iVector feature. 
+ // choose iVector from a random frame in the chunk + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; + Matrix ivector(1, ivector_feats->NumCols()); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); + eg.inputs.push_back(NnetIo("ivector", 0, ivector)); + } + + // Note: chunk.first_frame and chunk.num_frames will both be + // multiples of frame_subsampling_factor. + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + // Subtract 1 from post to convert it back to pdf-id. + Posterior labels(num_frames_subsampled); + + for (int i = 0; i < num_frames_subsampled; i++) { + int t = i + start_frame_subsampled; + if (t < pdf_post.size()) { + for (int32 j = 0; j < pdf_post[t].size(); j++) { + BaseFloat post = pdf_post[t][j].second; + KALDI_ASSERT(pdf_post[t][j].first > 0); + if (post > min_post) { + labels[i].push_back(std::make_pair( + pdf_post[t][j].first - 1, post)); // Convert from 1-index to 0-index + } + } + } + } + + SubVector output_weights( + &(chunk.output_weights[0]), + static_cast(chunk.output_weights.size())); + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); + + chain::Supervision supervision(num_pdfs, labels); + if (!deriv_weights) { + eg.outputs.push_back(NnetChainSupervision("output", supervision, output_weights, + 0, frame_subsampling_factor)); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + eg.outputs.push_back(NnetChainSupervision("output", supervision, this_deriv_weights, + 0, frame_subsampling_factor)); + } + + if (compress) + eg.Compress(); + + std::ostringstream os; + os << utt_id << "-" << chunk.first_frame; + + std::string key = os.str(); // key is <utterance-id>-<first-frame> + + example_writer->Write(key, eg); + } + return true; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get frame-by-frame examples of data for nnet3+chain neural network\n" + "training. This involves breaking up utterances into pieces of a\n" + "fixed size. \n" + "The input is a lattice, which is transformed into a new lattice " + "with pdf labels.
It is then composed with the normalization FST " + "and forward-backward is done to get posteriors.\n" + "This egs generation can be used in a teacher-student learning setup,\n" + "where the lattice is extracted from the teacher network.\n" + "Note: if the normalization FST is not supplied the egs will not be\n" + "ready for training; in that case they should later be processed\n" + "with nnet3-chain-normalize-egs\n" + "\n" + "Usage: nnet3-chain-get-egs-post [options] [<normalization-fst>] <trans-model> " + "<features-rspecifier> <lattice-rspecifier> <egs-wspecifier>\n" + "\n" + "An example [where $feats expands to the actual features]:\n" + "nnet3-chain-get-egs-post --left-context=25 --right-context=9\n" + "--num-frames=20 dir/normalization.fst dir/0.trans_mdl \"$feats\" \n" + "ark:lat.1.ark ark:cegs.1.ark"; + + bool compress = true; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; + + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. + + int32 srand_seed = 0; + std::string online_ivector_rspecifier, + deriv_weights_rspecifier; + BaseFloat min_post = 1e-8, lm_scale = 0.5, acoustic_scale = 1.0; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs with input features " + "in compressed format (recommended). Update: this is now " + "only relevant if the features being read are un-compressed; " + "if already compressed, we keep the same compressed format when " + "dumping-egs."); + po.Register("ivectors", &online_ivector_rspecifier, "Alias for " + "--online-ivectors option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " + "ivector features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("length-tolerance", &length_tolerance, "Tolerance for " + "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, "Tolerance for " + "difference in num-frames-subsampled between supervision and deriv weights"); + po.Register("min-post", &min_post, "Minimum posterior to keep; this will " + "avoid dumping out all posteriors."); + po.Register("acoustic-scale", &acoustic_scale, + "Scale on the acoustic scores in the lattice"); + po.Register("lm-scale", &lm_scale, + "Scale the LM weights on the lattice and interpolate with " + "1-lm-scale times the normalization FST"); + po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, + "Not implemented"); + + eg_config.Register(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 4 || po.NumArgs() > 5) { + po.PrintUsage(); + exit(1); + } + + std::string + normalization_fst_rxfilename, + trans_model, + feature_rspecifier, + lattice_rspecifier, + examples_wspecifier; + if (po.NumArgs() == 4) { + trans_model = po.GetArg(1); + feature_rspecifier = po.GetArg(2); + lattice_rspecifier = po.GetArg(3); + examples_wspecifier = po.GetArg(4); + } else { + normalization_fst_rxfilename = po.GetArg(1); + KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + trans_model = po.GetArg(2); + feature_rspecifier = po.GetArg(3); + lattice_rspecifier = po.GetArg(4); + examples_wspecifier = po.GetArg(5); + } + + TransitionModel tmodel; + ReadKaldiObject(trans_model, &tmodel); + + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + + fst::StdVectorFst normalization_fst; + if
(!normalization_fst_rxfilename.empty()) { + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + KALDI_ASSERT(normalization_fst.NumStates() > 0); + + ApplyProbabilityScale(1.0 - lm_scale, &normalization_fst); + } + + // Read as GeneralMatrix so we don't need to un-compress and re-compress + // when selecting parts of matrices. + SequentialGeneralMatrixReader feat_reader(feature_rspecifier); + //chain::RandomAccessSupervisionReader supervision_reader( + // supervision_rspecifier); + RandomAccessLatticeReader lattice_reader(lattice_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); + + int32 num_err = 0, num_done = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const GeneralMatrix &feats = feat_reader.Value(); + if (!lattice_reader.HasKey(key)) { + KALDI_WARN << "No pdf-level posterior for key " << key; + num_err++; + } else { + //const chain::Supervision &supervision = supervision_reader.Value(key); + Lattice lat = lattice_reader.Value(key); + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { + KALDI_WARN << "No iVectors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + online_ivector_feats = &(online_ivector_reader.Value(key)); + } + } + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << online_ivector_feats->NumRows() + << "exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + + fst::StdVectorFst sup_fst; + fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &lat); + ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst); + + if (normalization_fst.NumStates() > 0 && + !chain::AddWeightToFst(normalization_fst, &sup_fst)) { + KALDI_WARN << "For utterance " << key << ", feature frames " + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + } + + // Convert fst to lattice to extract posterior using forward backward. + Lattice sup_lat; + ConvertFstToLattice(sup_fst, &sup_lat); + + kaldi::uint64 props = sup_lat.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&sup_lat) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + + Posterior pdf_post; + LatticeForwardBackward(sup_lat, &pdf_post); + + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
+ deriv_weights = &(deriv_weights_reader.Value(key)); + } + } + + if (!ProcessFile(feats, online_ivector_feats, online_ivector_period, + pdf_post, min_post, deriv_weights, supervision_length_tolerance, + key, compress, tmodel.NumPdfs(), + &utt_splitter, &example_writer)) + num_err++; + num_done++; + } + } + + if (num_err > 0) + KALDI_WARN << "Processed " << num_done << " utterances; " + << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 1032b7e2125..7a7ccb364ee 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -161,6 +161,7 @@ static bool ProcessFile(const TransitionModel *trans_mdl, << (chunk.first_frame + chunk.num_frames) << ", FST was empty after composing with normalization FST. " << "This should be extremely rare (a few per corpus, at most)"; + continue; } int32 first_frame = 0; // we shift the time-indexes of all these parts so @@ -172,6 +173,7 @@ static bool ProcessFile(const TransitionModel *trans_mdl, SubVector output_weights( &(chunk.output_weights[0]), static_cast(chunk.output_weights.size())); + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); if (!deriv_weights) { NnetChainSupervision nnet_supervision("output", supervision_part, @@ -186,7 +188,6 @@ static bool ProcessFile(const TransitionModel *trans_mdl, if (t < deriv_weights->Dim()) this_deriv_weights(i) = (*deriv_weights)(t); } - KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); this_deriv_weights.MulElements(output_weights); NnetChainSupervision nnet_supervision("output", supervision_part, this_deriv_weights, diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc new file mode 100644 index 00000000000..c5c14b45679 --- /dev/null +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -0,0 +1,507 @@ +// chainbin/nnet3-chain-get-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "chain/chain-supervision-splitter.h" +#include "lat/lattice-functions.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" + +namespace kaldi { +namespace nnet3 { + +/** + This function does all the processing for one utterance, and outputs the + supervision objects to 'example_writer'. 
Note: if normalization_fst is the + empty FST (with no states), it skips the final stage of egs preparation and + you should do it later with nnet3-chain-normalize-egs. +*/ + +static bool ProcessFile(const chain::SupervisionOptions &sup_opts, + const fst::StdVectorFst &normalization_fst, + const GeneralMatrix &feats, + const MatrixBase *ivector_feats, + int32 ivector_period, + const TransitionModel &trans_model, + const chain::SupervisionLatticeSplitter &sup_lat_splitter, + const VectorBase *deriv_weights, + const Posterior *graph_posteriors, BaseFloat min_post, + int32 supervision_length_tolerance, + const std::string &utt_id, + bool compress, + UtteranceSplitter *utt_splitter, + NnetChainExampleWriter *example_writer, + bool add_numerator_post = false) { + + int32 num_input_frames = feats.NumRows(); + + std::vector state_times; + + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + int32 num_output_frames = sup_lat_splitter.NumFrames(); + + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames, + supervision_length_tolerance)) + return false; // LengthsMatch() will have printed a warning. + + std::vector chunks; + + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; + return false; + } + + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; + + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + + chain::Supervision supervision_part; + Lattice *lat_part = NULL; + + if (add_numerator_post) + lat_part = new Lattice(); + + if (!sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, + num_frames_subsampled, + &supervision_part, + NULL, lat_part)) { + delete lat_part; + continue; + } + + if (add_numerator_post) { + Posterior post_part; + if (!chain::LatticeToNumeratorPost(*lat_part, trans_model, + normalization_fst, &post_part)) { + delete lat_part; + continue; + } + KALDI_ASSERT(post_part.size() == num_frames_subsampled); + + Posterior labels(num_frames_subsampled); + + for (int32 i = 0; i < num_frames_subsampled; i++) { + for (int32 j = 0; j < post_part[i].size(); j++) { + BaseFloat post = post_part[i][j].second; + KALDI_ASSERT(post_part[i][j].first > 0); + if (post > min_post) { + labels[i].push_back(std::make_pair( + post_part[i][j].first - 1, post)); // Convert from 1-index to 0-index + } + } + } + + SparseMatrix smat(trans_model.NumPdfs(), labels); + supervision_part.numerator_post_targets = smat; + + delete lat_part; + } else if (graph_posteriors) { + Posterior labels; + labels.resize(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + for (int32 j = 0; j < (*graph_posteriors)[t].size(); j++) { + BaseFloat post = (*graph_posteriors)[t][j].second; + KALDI_ASSERT((*graph_posteriors)[t][j].first > 0); + if (post > min_post) { + labels[i].push_back(std::make_pair( + (*graph_posteriors)[t][j].first - 1, post)); // Convert from 1-index to 0-index + } + } + } + + 
SparseMatrix smat(trans_model.NumPdfs(), labels); + supervision_part.numerator_post_targets = smat; + } + + if (normalization_fst.NumStates() > 0 && + !chain::AddWeightToSupervisionFst(normalization_fst, + &supervision_part)) { + KALDI_WARN << "For utterance " << utt_id << ", feature frames " + << chunk.first_frame << " to " + << (chunk.first_frame + chunk.num_frames) + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + continue; + } + + int32 first_frame = 0; // we shift the time-indexes of all these parts so + // that the supervised part starts from frame 0. + + NnetChainExample nnet_chain_eg; + nnet_chain_eg.outputs.resize(1); + + SubVector output_weights( + &(chunk.output_weights[0]), + static_cast(chunk.output_weights.size())); + + if (!deriv_weights) { + NnetChainSupervision nnet_supervision("output", supervision_part, + output_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); + this_deriv_weights.MulElements(output_weights); + NnetChainSupervision nnet_supervision("output", supervision_part, + this_deriv_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } + + nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1); + + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context, + start_frame = chunk.first_frame - chunk.left_context; + + GeneralMatrix input_frames; + ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames, + &input_frames); + + NnetIo input_io("input", -chunk.left_context, input_frames); + nnet_chain_eg.inputs[0].Swap(&input_io); + + if (ivector_feats != NULL) { + // if applicable, add the iVector feature. + // choose iVector from a random frame in the chunk + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; + Matrix ivector(1, ivector_feats->NumCols()); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); + NnetIo ivector_io("ivector", 0, ivector); + nnet_chain_eg.inputs[1].Swap(&ivector_io); + } + + if (compress) + nnet_chain_eg.Compress(); + + std::ostringstream os; + os << utt_id << "-" << chunk.first_frame; + + std::string key = os.str(); // key is - + + example_writer->Write(key, nnet_chain_eg); + } + return true; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get frame-by-frame examples of data for nnet3+chain neural network\n" + "training. This involves breaking up utterances into pieces of a\n" + "fixed size. 
The input comes from non-compact phone lattices.\n" + "Note: if the normalization FST is not supplied the egs will not be\n" + "ready for training; in that case they should later be processed\n" + "with nnet3-chain-normalize-egs\n" + "\n" + "Usage: nnet3-chain-split-and-get-egs [options] [<normalization-fst>] " + "<features-rspecifier> <tree> <trans-model> <lattice-rspecifier> <egs-wspecifier>\n" + "\n" + "An example [where $feats expands to the actual features]:\n" + "lattice-copy --write-compact=false ark:1.lat ark:- | \\ \n" + " nnet3-chain-split-and-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n" + " \"$feats\" dir/tree dir/0.trans_mdl ark,s,cs:- ark:cegs.1.ark\n"; + + bool compress = true; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; + + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. + chain::SupervisionOptions sup_opts; + + int32 srand_seed = 0; + std::string online_ivector_rspecifier, deriv_weights_rspecifier, + graph_posterior_rspecifier; + std::string den_fst_rxfilename; + + BaseFloat min_post = 1e-8; + bool add_numerator_post = false; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs with input features " + "in compressed format (recommended). Update: this is now " + "only relevant if the features being read are un-compressed; " + "if already compressed, we keep the same compressed format when " + "dumping-egs."); + po.Register("ivectors", &online_ivector_rspecifier, "Alias for " + "--online-ivectors option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " + "ivector features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("length-tolerance", &length_tolerance, "Tolerance for " + "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, "Tolerance for " + "difference in num-frames-subsampled between supervision and deriv weights"); + po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, + "Per-frame weights (binary, 0 or 1) that specify " + "whether a frame's gradient must be backpropagated or not.
" + "Not specifying this is equivalent to specifying a vector of " + "all 1s."); + po.Register("graph-posterior-rspecifier", &graph_posterior_rspecifier, + "Pdf posteriors where the labels are 1-indexed"); + po.Register("min-post", &min_post, "Minimum posterior to keep; this will " + "avoid dumping out all posteriors."); + po.Register("add-numerator-post", &add_numerator_post, + "Add numerator post to supervision; this is alternative to " + "graph-posterior-rspecifier"); + po.Register("den-fst", &den_fst_rxfilename, + "If provided, will compose this with the lattice " + "before splitting."); + + eg_config.Register(&po); + + ParseOptions supervision_opts("supervision", &po); + sup_opts.Register(&supervision_opts); + + chain::SupervisionLatticeSplitterOptions sup_lat_splitter_opts; + sup_lat_splitter_opts.Register(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 5 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + std::string + normalization_fst_rxfilename, + feature_rspecifier, + tree_rxfilename, trans_model_rxfilename, + lattice_rspecifier, + examples_wspecifier; + if (po.NumArgs() == 5) { + feature_rspecifier = po.GetArg(1); + tree_rxfilename = po.GetArg(2); + trans_model_rxfilename = po.GetArg(3); + lattice_rspecifier = po.GetArg(4); + examples_wspecifier = po.GetArg(5); + } else { + normalization_fst_rxfilename = po.GetArg(1); + KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + feature_rspecifier = po.GetArg(2); + tree_rxfilename = po.GetArg(3); + trans_model_rxfilename = po.GetArg(4); + lattice_rspecifier = po.GetArg(5); + examples_wspecifier = po.GetArg(6); + } + + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + + if (add_numerator_post) + KALDI_ASSERT(!normalization_fst_rxfilename.empty() || + !den_fst_rxfilename.empty()); + + fst::StdVectorFst normalization_fst; + if (!normalization_fst_rxfilename.empty()) { + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + KALDI_ASSERT(normalization_fst.NumStates() > 0); + + if (sup_opts.lm_scale < 0.0 || sup_opts.lm_scale > 1.0) { + KALDI_ERR << "Invalid lm-scale; must be in [0.0, 1.0]"; + } + + if (sup_opts.lm_scale != 0.0) { + fst::ApplyProbabilityScale(1.0 - sup_opts.lm_scale, &normalization_fst); + } + } + + fst::StdVectorFst den_fst; + if (!den_fst_rxfilename.empty()) { + KALDI_LOG << "Adding weights from denominator FST before splitting."; + + normalization_fst = den_fst; // clear normalization FST + + ReadFstKaldi(den_fst_rxfilename, &den_fst); + KALDI_ASSERT(den_fst.NumStates() > 0); + + if (sup_opts.lm_scale < 0.0 || sup_opts.lm_scale >= 1.0) { + KALDI_ERR << "Invalid lm-scale; must be in [0.0, 1.0]"; + } + + if (sup_opts.lm_scale != 0.0) { + fst::ApplyProbabilityScale(1.0 - sup_opts.lm_scale, &den_fst); + } + } + + // Read as GeneralMatrix so we don't need to un-compress and re-compress + // when selecting parts of matrices. 
+ SequentialGeneralMatrixReader feat_reader(feature_rspecifier); + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + ContextDependency ctx_dep; + ReadKaldiObject(tree_rxfilename, &ctx_dep); + + RandomAccessLatticeReader lattice_reader( + lattice_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); + RandomAccessPosteriorReader graph_posterior_reader( + graph_posterior_rspecifier); + + int32 num_err = 0; + + KALDI_ASSERT(sup_opts.frame_subsampling_factor == 1); + + // We require alignments to be from the same chain model + // If den_fst is not empty, it will be composed with the lattice + // before splitting. + chain::SupervisionLatticeSplitter sup_lat_splitter( + sup_lat_splitter_opts, sup_opts, trans_model, den_fst); + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const GeneralMatrix &feats = feat_reader.Value(); + if (!lattice_reader.HasKey(key)) { + KALDI_WARN << "No lattice for key " << key; + num_err++; + } else { + const Lattice &lat = lattice_reader.Value(key); + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { + KALDI_WARN << "No iVectors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + online_ivector_feats = &(online_ivector_reader.Value(key)); + } + } + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << online_ivector_feats->NumRows() + << "exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + deriv_weights = &(deriv_weights_reader.Value(key)); + } + } + + const Posterior *graph_posteriors = NULL; + if (!graph_posterior_rspecifier.empty()) { + if (!graph_posterior_reader.HasKey(key)) { + KALDI_WARN << "No graph posteriors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + graph_posteriors = &(graph_posterior_reader.Value(key)); + } + } + + if (!sup_lat_splitter.LoadLattice(lat)) { + KALDI_WARN << "For utterance " << key + << ", FST was empty after composing with denominator FST. " + << "This should be extremely rare (a few per corpus, at most)"; + num_err++; + continue; + } + + if (!ProcessFile(sup_opts, normalization_fst, feats, + online_ivector_feats, online_ivector_period, + trans_model, sup_lat_splitter, + deriv_weights, graph_posteriors, min_post, + supervision_length_tolerance, + key, compress, + &utt_splitter, &example_writer, add_numerator_post)) + num_err++; + } + } + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. 
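The --graph-posterior-rspecifier and --add-numerator-post options are what tie this binary to chain-lattice-to-post above; a hedged pipeline sketch follows, with directory names and chunk sizes mirroring the usage string rather than any tested recipe:

  # Sketch: teacher lattices -> per-frame numerator posteriors -> split chain egs
  # that carry those posteriors as numerator targets.
  chain-lattice-to-post --acoustic-scale=1.0 dir/0.trans_mdl \
      ark:lat.1.ark ark:post.1.ark
  lattice-copy --write-compact=false ark:lat.1.ark ark:- | \
    nnet3-chain-split-and-get-egs --left-context=25 --right-context=9 --num-frames=20 \
      --graph-posterior-rspecifier=ark:post.1.ark \
      dir/normalization.fst "$feats" dir/tree dir/0.trans_mdl ark,s,cs:- ark:cegs.1.ark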
+ return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 6b99a77e73b..4cbbdcab9fd 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -262,6 +262,12 @@ void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in); void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in); +void cudaD_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, double *mat_out, + const double *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out); +void cudaF_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, float *mat_out, + const float *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out); void cudaD_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in); void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, @@ -575,6 +581,10 @@ void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d); void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d); +void cudaD_mul_cols_group_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, + MatrixDim d, int group_size); +void cudaF_mul_cols_group_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, + MatrixDim d, int group_size); void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride); void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, @@ -750,6 +760,9 @@ void cudaF_vec_min(int Gr, int Bl, const float* v, float* value, int dim, void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim); void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim); +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, + int dim); +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim); void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim); void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 934a860a055..9cec7e83a7a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -566,6 +566,14 @@ static void _vec_mul_elements(Real* v, const Real* a, int dim) { v[i] = v[i] * a[i]; } +template +__global__ +static void _vec_div_elements(Real* v, const Real* a, int dim) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < dim) + v[i] = v[i] / a[i]; +} + template __global__ static void _mul_cols_vec(Real* mat, const Real* scale, MatrixDim d) { @@ -576,6 +584,19 @@ static void _mul_cols_vec(Real* mat, const Real* scale, MatrixDim d) { mat[index] *= scale[i]; } +template +__global__ +static void _mul_cols_group_vec(Real* mat, const Real* scale, MatrixDim d, + int group_size) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + + if (i < d.cols && j < d.rows) { + mat[index] *= scale[i % group_size]; + } +} + template __global__ static void _mul_rows_vec(Real* mat, const Real* scale, MatrixDim d) { @@ -946,6 +967,25 @@ static void _copy_cols_from_vec(Real* m_out, MatrixDim d, const Real* v_in) { } } +// This kernel writes a copy of the vector "v_in" to each 
col i of the matrix +// "m_out", where indices[i] != -1. If indices[i] == -1, then that column is +// left as is. +// the dimension of v_in should be equal to the #row of m_out. +// the dimension of indices should be equal to the #col of m_out. +template +__global__ +static void _copy_cols_at_indices_from_vec(Real* m_out, const Real* v_in, + const MatrixIndexT_cuda* indices, + MatrixDim d) { + int i = blockIdx.y * blockDim.y + threadIdx.y; // row id + int j = blockIdx.x * blockDim.x + threadIdx.x; // col id + if (i < d.rows && j < d.cols) { + if (indices[j] != -1) { + m_out[i * d.stride + j] = v_in[i]; + } + } +} + // _trace_mat_mat reduce the partial sum to // value[blockIdx.y * gridDim.x + blockIdx.x] // It use shared mem to transpose matrix B to ensure coalesced memory access @@ -3899,6 +3939,11 @@ void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, _mul_cols_vec<<>>(mat,scale,d); } +void cudaF_mul_cols_group_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, + MatrixDim d, int group_size) { + _mul_cols_group_vec<<>>(mat,scale,d,group_size); +} + void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); @@ -4052,6 +4097,10 @@ void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { _vec_mul_elements<<>>(v, a, dim); } +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaF_vec_min(int Gr, int Bl, const float* v, float* value, int dim, int inc) { _vec_transform_reduce<<>>(v, value, dim, inc, @@ -4604,6 +4653,11 @@ void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, _mul_cols_vec<<>>(mat,scale,d); } +void cudaD_mul_cols_group_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, + MatrixDim d, int group_size) { + _mul_cols_group_vec<<>>(mat,scale,d,group_size); +} + void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); @@ -4748,6 +4802,11 @@ void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, _vec_mul_elements<<>>(v, a, dim); } +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, + int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaD_vec_min(int Gr, int Bl, const double* v, double* value, int dim, int inc) { _vec_transform_reduce<<>>(v, value, dim, inc, @@ -5315,6 +5374,20 @@ void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, _copy_cols_from_vec<<>>(mat_out, d_out, v_in); } +void cudaD_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, double *mat_out, + const double *v_in, + const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + _copy_cols_at_indices_from_vec<<>>(mat_out, v_in, indices, d_out); +} + +void cudaF_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, float *mat_out, + const float *v_in, + const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + _copy_cols_at_indices_from_vec<<>>(mat_out, v_in, indices, d_out); +} + void cudaF_diff_normalize_per_row(size_t Gr, size_t Bl, float *id, int id_stride, const float *iv, MatrixDim iv_dim, const float* od, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 8f719a8c4a1..1ac5efa0c58 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -491,6 +491,16 @@ inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in) { cudaF_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in); } +inline void 
cuda_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, double *mat_out, + const double *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + cudaD_copy_cols_at_indices_from_vec(Gr, Bl, mat_out, v_in, indices, d_out); +} +inline void cuda_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, float *mat_out, + const float *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + cudaF_copy_cols_at_indices_from_vec(Gr, Bl, mat_out, v_in, indices, d_out); +} inline void cuda_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { @@ -1073,6 +1083,16 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr, Bl, mat, scale, d); } +inline void cuda_mul_cols_group_vec(dim3 Gr, dim3 Bl, double *mat, + const double *scale, MatrixDim d, + int group_size) { + cudaD_mul_cols_group_vec(Gr, Bl, mat, scale, d, group_size); +} +inline void cuda_mul_cols_group_vec(dim3 Gr, dim3 Bl, float *mat, + const float *scale, MatrixDim d, + int group_size) { + cudaF_mul_cols_group_vec(Gr, Bl, mat, scale, d, group_size); +} inline void cuda_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) { cudaD_mul_elements(Gr, Bl, mat, A, dst_d, src_stride); @@ -1466,6 +1486,14 @@ inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_mul_elements(Gr, Bl, v, a, dim); } +inline void cuda_vec_div_elements(int Gr, int Bl, double* v, const double* a, + int dim) { + cudaD_vec_div_elements(Gr, Bl, v, a, dim); +} +inline void cuda_vec_div_elements(int Gr, int Bl, float* v, const float* a, + int dim) { + cudaF_vec_div_elements(Gr, Bl, v, a, dim); +} inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr, Bl, v, dim); } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index beccd9dc4a5..6327b0860de 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -774,6 +774,34 @@ void CuMatrixBase::MulColsVec(const CuVectorBase &scale) { } +template +void CuMatrixBase::MulColsGroupVec(const CuVectorBase &scale) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + cuda_mul_cols_group_vec(dimGrid, dimBlock, data_, scale.data_, Dim(), + scale.Dim()); + CU_SAFE_CALL(cudaGetLastError()); + + + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_groups = NumCols() / scale.Dim(); + for (int32 i = 0; i < num_groups; i++) { + CuSubMatrix this_mat(*this, 0, NumRows(), + i * scale.Dim(), scale.Dim()); + this_mat.Mat().MulColsVec(scale.Vec()); + } + } +} + template void CuMatrixBase::MulRowsVec(const CuVectorBase &scale) { @@ -2383,6 +2411,34 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { } } +template +void CuMatrixBase::CopyColsFromVec(const CuVectorBase &v, + const CuArray &indices) { + KALDI_ASSERT(indices.Dim() == NumCols()); + KALDI_ASSERT(NumRows() == v.Dim()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + // use 2D block (8x32) and large enough grid to cover matrix *this + // dimBlock.x need to be at least warpSize for coalesced memory access. 
+ const int32 warpSize = 32; + dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); + dim3 dimGrid(n_blocks(num_cols_, dimBlock.x), + n_blocks(num_rows_, dimBlock.y)); + cuda_copy_cols_at_indices_from_vec(dimGrid, dimBlock, Data(), v.Data(), + indices.Data(), Dim()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + for (MatrixIndexT j = 0; j < NumCols(); j++) { + if (indices.Data()[j] != -1) + Mat().CopyColFromVec(v.Vec(), j); + } + } +} + template void CuMatrixBase::CopyColFromVec(const CuVectorBase &v, @@ -3439,7 +3495,6 @@ std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); - // Instantiate classes CuMatrix and CuMatrixBase for float and double. template class CuMatrix; template class CuMatrix; diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 85aa4c049e7..63c27a9745d 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -101,7 +101,6 @@ class CuMatrixBase { void CopyCols(const CuMatrixBase &src, const CuArrayBase &indexes); - /// Add column indices[r] of src to column r. /// As a special case, if indexes[i] == -1, skip column i /// indices.size() must equal this->NumCols(), @@ -109,6 +108,9 @@ class CuMatrixBase { void AddCols(const CuMatrixBase &src, const CuArrayBase &indices); + /// Sets all elements in column indexes defined by 'ids' to 'value' + void SetCols(Real value, const CuArray &ids); + /// Copies row r from row indexes[r] of src. /// As a special case, if indexes[i] < 0, sets row i to zero. /// src.NumCols() must equal this.NumCols() @@ -267,10 +269,17 @@ class CuMatrixBase { void CopyRowsFromVec(const VectorBase &v); /// Copies vector into matrix, column-by-column. - /// Note that rv.Dim() must either equal NumRows()*NumCols() or NumRows(); + /// Note that v.Dim() must either equal NumRows()*NumCols() or NumRows(); /// this has two modes of operation. void CopyColsFromVec(const CuVectorBase &v); + /// Copies vector into column i of matrix if indices[i] != -1, else keep + /// column i as is. + /// indices.size() must equal this->NumCols(), + /// and v.Dim() must equal this.NumRows() + void CopyColsFromVec(const CuVectorBase &v, + const CuArray &indices); + /// Copy vector into specific column of matrix. void CopyColFromVec(const CuVectorBase &v, const MatrixIndexT col); @@ -446,6 +455,9 @@ class CuMatrixBase { void Min(const CuMatrixBase &A); /// scale i'th column by scale[i] void MulColsVec(const CuVectorBase &scale); + /// Divide each row into groups of size scale.Dim() and multiply + /// j^th element in each group of each row by scale[j]. + void MulColsGroupVec(const CuVectorBase &scale); /// scale i'th row by scale[i] void MulRowsVec(const CuVectorBase &scale); /// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j]. @@ -650,6 +662,11 @@ class CuMatrixBase { void SetRandUniform(); void Write(std::ostream &os, bool binary) const; + inline std::string ToStr() const { + std::ostringstream oss; + oss << *this; + return oss.str(); + } // This function, adds a list of MatrixElements (scaled by alpha) to corresponding locations to // (*this). 
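For reviewers, a minimal usage sketch (editor's illustration, not part of the patch) of the CuMatrix calls added above; the helper name and the dimensions are made up, and it assumes the patch is applied on top of the existing cudamatrix headers:

#include <vector>
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-array.h"

namespace kaldi {
// Hypothetical demo routine, for illustration only.
void DemoNewCuMatrixOps() {
  CuMatrix<BaseFloat> m(4, 6);   // 4 rows, 6 columns.
  m.Set(1.0);

  // CopyColsFromVec with an index mask: every column whose entry in 'indices'
  // is not -1 is overwritten with v; columns marked -1 are left untouched.
  CuVector<BaseFloat> v(4);      // v.Dim() must equal m.NumRows().
  v.Set(2.5);
  std::vector<MatrixIndexT> marks = {0, -1, 2, -1, -1, -1};
  CuArray<MatrixIndexT> indices(marks);  // indices.Dim() must equal m.NumCols().
  m.CopyColsFromVec(v, indices);         // columns 0 and 2 become copies of v.

  // MulColsGroupVec: each row is split into NumCols() / scale.Dim() groups
  // (here 6 / 3 = 2) and the j-th element of every group is scaled by scale(j).
  CuVector<BaseFloat> scale(3);
  scale.Set(0.5);
  m.MulColsGroupVec(scale);

  KALDI_VLOG(2) << "m is now: " << m.ToStr();  // ToStr() is also added by this patch.
}
}  // namespace kaldi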
@@ -903,6 +920,8 @@ bool SameDimAndStride(const CuMatrixBase &M, const CuMatrixBase &N) template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); +template +std::string ToStr(const CuMatrixBase &mat); template template diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 69ca2ae3125..ca445cda2e1 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -299,6 +299,11 @@ class CuVector: public CuVectorBase { /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; + inline std::string ToStr() const { + std::ostringstream oss; + oss << *this; + return oss.str(); + } private: diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc index ceca116c828..c9b29ce24af 100644 --- a/src/hmm/hmm-test-utils.cc +++ b/src/hmm/hmm-test-utils.cc @@ -23,10 +23,10 @@ namespace kaldi { -TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out) { +TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out, int32 max_phone) { std::vector phones; phones.push_back(1); - for (int32 i = 2; i < 20; i++) + for (int32 i = 2; i <= max_phone; i++) if (rand() % 2 == 0) phones.push_back(i); int32 N = 2 + rand() % 2, // context-size N is 2 or 3. diff --git a/src/hmm/hmm-test-utils.h b/src/hmm/hmm-test-utils.h index 495ebf278ae..148ac44c1be 100644 --- a/src/hmm/hmm-test-utils.h +++ b/src/hmm/hmm-test-utils.h @@ -33,7 +33,7 @@ namespace kaldi { // This function returns a randomly generated TransitionModel object. // If 'ctx_dep' is not NULL, it outputs to *ctx_dep a pointer to the // tree that was used to generate the transition model. -TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep); +TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep, int32 max_phone = 19); /// This function returns a HmmTopology object giving a normal 3-state topology, /// covering all phones in the list "phones". This is mainly of use in testing diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index 54c856a9403..b6c3b1aa898 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -388,6 +388,11 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post, if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) { KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob << ", while total backward probability = " << tot_backward_prob; + + if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-2)) { + KALDI_ERR << "Total forward probability over lattice = " << tot_forward_prob + << ", while total backward probability = " << tot_backward_prob; + } } // Now combine any posteriors with the same transition-id. for (int32 t = 0; t < max_time; t++) @@ -421,7 +426,7 @@ void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans, } void ConvertLatticeToPhones(const TransitionModel &trans, - Lattice *lat) { + Lattice *lat, bool replace_words) { typedef LatticeArc Arc; int32 num_states = lat->NumStates(); for (int32 state = 0; state < num_states; state++) { @@ -433,7 +438,10 @@ void ConvertLatticeToPhones(const TransitionModel &trans, && (trans.TransitionIdToHmmState(arc.ilabel) == 0) && (!trans.IsSelfLoop(arc.ilabel))) { // && trans.IsFinal(arc.ilabel)) // there is one of these per phone... 
- arc.olabel = trans.TransitionIdToPhone(arc.ilabel); + if (replace_words) + arc.olabel = trans.TransitionIdToPhone(arc.ilabel); + else + arc.ilabel = trans.TransitionIdToPhone(arc.ilabel); } aiter.SetValue(arc); } // end looping over arcs @@ -498,6 +506,11 @@ double ComputeLatticeAlphasAndBetas(const LatticeType &lat, if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) { KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob << ", while total backward probability = " << tot_backward_prob; + + if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-2)) { + KALDI_ERR << "Total forward probability over lattice = " << tot_forward_prob + << ", while total backward probability = " << tot_backward_prob; + } } // Split the difference when returning... they should be the same. return 0.5 * (tot_backward_prob + tot_forward_prob); diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h index c7fe4833a4a..163cdca3b30 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -152,7 +152,7 @@ void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans, /// we do reorder). /// Also see PhoneAlignLattice, in phone-align-lattice.h. void ConvertLatticeToPhones(const TransitionModel &trans_model, - Lattice *lat); + Lattice *lat, bool replace_words = true); /// Prunes a lattice or compact lattice. Returns true on success, false if /// there was some kind of failure. diff --git a/src/latbin/lattice-1best.cc b/src/latbin/lattice-1best.cc index e03736561f8..46f4b5a8add 100644 --- a/src/latbin/lattice-1best.cc +++ b/src/latbin/lattice-1best.cc @@ -66,9 +66,9 @@ int main(int argc, char *argv[]) { lats_wspecifier = po.GetArg(2); SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + // Write as compact lattice. - CompactLatticeWriter compact_1best_writer(lats_wspecifier); + CompactLatticeWriter compact_1best_writer(lats_wspecifier); int32 n_done = 0, n_err = 0; @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { CompactLattice best_path; CompactLatticeShortestPath(clat, &best_path); - + if (best_path.Start() == fst::kNoStateId) { KALDI_WARN << "Possibly empty lattice for utterance-id " << key << "(no output)"; diff --git a/src/latbin/lattice-align-phones.cc b/src/latbin/lattice-align-phones.cc index 9367fb1f3a7..9f9a11575dc 100644 --- a/src/latbin/lattice-align-phones.cc +++ b/src/latbin/lattice-align-phones.cc @@ -43,8 +43,10 @@ int main(int argc, char *argv[]) { " lattice-1best | nbest-to-prons\n"; ParseOptions po(usage); + bool write_compact = true; bool output_if_error = true; - + + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("output-error-lats", &output_if_error, "Output lattices that aligned " "with errors (e.g. 
due to force-out"); @@ -66,17 +68,44 @@ int main(int argc, char *argv[]) { TransitionModel tmodel; ReadKaldiObject(model_rxfilename, &tmodel); - SequentialCompactLatticeReader clat_reader(lats_rspecifier); - CompactLatticeWriter clat_writer(lats_wspecifier); + SequentialCompactLatticeReader clat_reader; + CompactLatticeWriter clat_writer; + SequentialLatticeReader lat_reader; + LatticeWriter lat_writer; + + if (write_compact) { + clat_reader.Open(lats_rspecifier); + clat_writer.Open(lats_wspecifier); + } else { + lat_reader.Open(lats_rspecifier); + lat_writer.Open(lats_wspecifier); + } int32 num_done = 0, num_err = 0; - for (; !clat_reader.Done(); clat_reader.Next()) { - std::string key = clat_reader.Key(); - const CompactLattice &clat = clat_reader.Value(); + for (; write_compact ? !clat_reader.Done() : !lat_reader.Done(); + write_compact ? clat_reader.Next() : lat_reader.Next()) { + std::string key = write_compact ? clat_reader.Key() : lat_reader.Key(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; CompactLattice aligned_clat; - bool ok = PhoneAlignLattice(clat, tmodel, opts, &aligned_clat); + bool ok; + if (write_compact) { + const CompactLattice &clat = clat_reader.Value(); + + ok = PhoneAlignLattice(clat, tmodel, opts, &aligned_clat); + } else { + const Lattice &lat = lat_reader.Value(); + ComputeAcousticScoresMap(lat, &acoustic_scores); + + CompactLattice clat; + fst::ConvertLattice(lat, &clat); + + ok = PhoneAlignLattice(clat, tmodel, opts, &aligned_clat); + } if (!ok) { num_err++; @@ -86,7 +115,18 @@ int main(int argc, char *argv[]) { if (aligned_clat.Start() != fst::kNoStateId) { KALDI_LOG << "Outputting partial lattice for " << key; TopSortCompactLatticeIfNeeded(&aligned_clat); - clat_writer.Write(key, aligned_clat); + + if (write_compact) { + clat_writer.Write(key, aligned_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(aligned_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } } } } else { @@ -97,7 +137,18 @@ int main(int argc, char *argv[]) { num_done++; KALDI_VLOG(2) << "Aligned lattice for " << key; TopSortCompactLatticeIfNeeded(&aligned_clat); - clat_writer.Write(key, aligned_clat); + + if (write_compact) { + clat_writer.Write(key, aligned_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(aligned_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } } } } diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index df70229bfd8..ad53c534c96 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -166,6 +166,7 @@ int main(int argc, char *argv[]) { ConvertLattice(lat_out, &clat_out); compact_lattice_writer.Write(key, clat_out); } else { + fst::TopSort(&lat_out); lattice_writer.Write(key, lat_out); } n_done++; diff --git a/src/latbin/lattice-determinize-phone-pruned-non-compact.cc b/src/latbin/lattice-determinize-phone-pruned-non-compact.cc new file mode 100644 index 00000000000..8b528348914 --- /dev/null +++ b/src/latbin/lattice-determinize-phone-pruned-non-compact.cc @@ -0,0 +1,139 @@ +// latbin/lattice-determinize-phoned-pruned-non-compact.cc + +// Copyright 2014 Guoguo Chen +// 2017 Vimal Manohar + +// See 
../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#include "base/kaldi-common.h" +#include "hmm/transition-model.h" +#include "lat/kaldi-lattice.h" +#include "lat/determinize-lattice-pruned.h" +#include "lat/lattice-functions.h" +#include "lat/push-lattice.h" +#include "util/common-utils.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + + const char *usage = + "Determinize lattices, keeping only the best path (sequence of\n" + "acoustic states) for each input-symbol sequence. This version does\n" + "phone insertion when doing a first pass determinization; it then\n" + "removes the inserted symbols and does a second pass determinization.\n" + "It also does pruning as part of the determinization algorithm, which\n" + "is more efficient and prevents blowup.\n" + "This version retains the acoustic scores on the arcs and writes the " + "output as a regular lattice.\n" + "\n" + "Usage: lattice-determinize-phone-pruned-non-compact [options] \\\n" + " \n" + " e.g.: lattice-determinize-phone-pruned-non-compact --acoustic-scale=0.1 \\\n" + " final.mdl ark:in.lats ark:det.lats\n"; + + ParseOptions po(usage); + BaseFloat acoustic_scale = 1.0; + BaseFloat beam = 10.0; + fst::DeterminizeLatticePhonePrunedOptions opts; + + po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic" + " likelihoods."); + po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); + opts.Register(&po); + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string model_rxfilename = po.GetArg(1), + lats_rspecifier = po.GetArg(2), + lats_wspecifier = po.GetArg(3); + + TransitionModel trans_model; + ReadKaldiObject(model_rxfilename, &trans_model); + + SequentialLatticeReader lat_reader(lats_rspecifier); + + LatticeWriter lat_writer(lats_wspecifier); + + int32 n_done = 0, n_warn = 0; + + // depth stats (for diagnostics).
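+ // (Lattice 'depth' here is the average number of arcs crossing each frame.)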
+ double sum_depth_in = 0.0, + sum_depth_out = 0.0, sum_t = 0.0; + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + + for (; !lat_reader.Done(); lat_reader.Next()) { + std::string key = lat_reader.Key(); + Lattice lat = lat_reader.Value(); + lat_reader.FreeCurrent(); + + KALDI_VLOG(2) << "Processing lattice " << key; + + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + + CompactLattice det_clat; + if (!DeterminizeLatticePhonePrunedWrapper( + trans_model, &lat, beam, &det_clat, opts)) { + KALDI_WARN << "For key " << key << ", determinization did not succeed " + "(partial output will be pruned tighter than the specified beam.)"; + n_warn++; + } + + int32 t; + TopSortCompactLatticeIfNeeded(&det_clat); + double depth = CompactLatticeDepth(det_clat, &t); + sum_depth_in += lat.NumStates(); + sum_depth_out += depth * t; + sum_t += t; + + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &out_lat); + lat_writer.Write(key, out_lat); + n_done++; + } + + if (sum_t != 0.0) { + KALDI_LOG << "Average input-lattice depth (measured at state level) is " + << (sum_depth_in / sum_t) << ", output depth is " + << (sum_depth_out / sum_t) << ", over " << sum_t << " frames " + << " (average num-frames = " << (sum_t / n_done) << ")."; + } + KALDI_LOG << "Done " << n_done << " lattices, determinization finished " + << "earlier than specified by the beam (or output was empty) on " + << n_warn << " of these."; + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/latbin/lattice-determinize-phone-pruned-parallel.cc b/src/latbin/lattice-determinize-phone-pruned-parallel.cc index 6d273d433c6..fa29e7dc8d3 100644 --- a/src/latbin/lattice-determinize-phone-pruned-parallel.cc +++ b/src/latbin/lattice-determinize-phone-pruned-parallel.cc @@ -38,12 +38,20 @@ class DeterminizeLatticeTask { BaseFloat beam, Lattice *lat, CompactLatticeWriter *clat_writer, + LatticeWriter *lat_writer, int32 *num_warn): trans_model_(&trans_model), opts_(opts), key_(key), acoustic_scale_(acoustic_scale), beam_(beam), - lat_(lat), clat_writer_(clat_writer), num_warn_(num_warn) { } + lat_(lat), clat_writer_(clat_writer), + lat_writer_(lat_writer), num_warn_(num_warn) { + KALDI_ASSERT((lat_writer_ && !clat_writer_) || + (!lat_writer_ && clat_writer_)); + } void operator () () { + if (lat_writer_) + ComputeAcousticScoresMap(*lat_, &acoustic_scores_); + + // We apply the acoustic scale before determinization and will undo it // afterward, since it can affect the result.
fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale_), lat_); @@ -57,16 +65,28 @@ class DeterminizeLatticeTask { delete lat_; lat_ = NULL; - - // Invert the original acoustic scaling - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale_), - &det_clat_); } ~DeterminizeLatticeTask() { - KALDI_VLOG(2) << "Wrote lattice with " << det_clat_.NumStates() - << " for key " << key_; - clat_writer_->Write(key_, det_clat_); + if (clat_writer_) { + KALDI_VLOG(2) << "Wrote lattice with " << det_clat_.NumStates() + << " for key " << key_; + // Invert the original acoustic scaling + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale_), + &det_clat_); + clat_writer_->Write(key_, det_clat_); + } else { + KALDI_VLOG(2) << "Wrote lattice with " << det_clat_.NumStates() + << " for key " << key_; + Lattice out_lat; + fst::ConvertLattice(det_clat_, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores_, &out_lat); + + lat_writer_->Write(key_, out_lat); + } } private: const TransitionModel *trans_model_; @@ -80,8 +100,12 @@ class DeterminizeLatticeTask { // destructor. CompactLattice det_clat_; CompactLatticeWriter *clat_writer_; + LatticeWriter *lat_writer_; int32 *num_warn_; - + + // Used to compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores_; }; } // namespace kaldi @@ -107,6 +131,7 @@ int main(int argc, char *argv[]) { " --acoustic-scale=0.1 final.mdl ark:in.lats ark:det.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; @@ -114,6 +139,12 @@ int main(int argc, char *argv[]) { fst::DeterminizeLatticePhonePrunedOptions determinize_opts; determinize_opts.max_mem = 50000000; + po.Register("write-compact", &write_compact, + "If true, write in normal (compact) form. " + "--write-compact=false allows you to retain frame-level " + "acoustic score information, but this requires the input " + "to be in non-compact form e.g. undeterminized lattice " + "straight from decoding."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic" " likelihoods."); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -137,8 +168,13 @@ int main(int argc, char *argv[]) { // accepts. SequentialLatticeReader lat_reader(lats_rspecifier); - // Writes as compact lattice. 
- CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter *compact_lat_writer = NULL; + LatticeWriter *lat_writer = NULL; + + if (write_compact) + compact_lat_writer = new CompactLatticeWriter(lats_wspecifier); + else + lat_writer = new LatticeWriter(lats_wspecifier); TaskSequencer sequencer(sequencer_opts); @@ -157,7 +193,7 @@ int main(int argc, char *argv[]) { DeterminizeLatticeTask *task = new DeterminizeLatticeTask( trans_model, determinize_opts, key, acoustic_scale, beam, - lat, &compact_lat_writer, &n_warn); + lat, compact_lat_writer, lat_writer, &n_warn); sequencer.Run(task); n_done++; diff --git a/src/latbin/lattice-determinize-pruned-non-compact.cc b/src/latbin/lattice-determinize-pruned-non-compact.cc new file mode 100644 index 00000000000..edf0fac213c --- /dev/null +++ b/src/latbin/lattice-determinize-pruned-non-compact.cc @@ -0,0 +1,157 @@ +// latbin/lattice-determinize-pruned-non-compact.cc + +// Copyright 2013 Daniel Povey (Johns Hopkins University) +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "lat/kaldi-lattice.h" +#include "lat/determinize-lattice-pruned.h" +#include "lat/lattice-functions.h" +#include "lat/push-lattice.h" +#include "lat/minimize-lattice.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + + const char *usage = + "Determinize lattices, keeping only the best path (sequence of acoustic states)\n" + "for each input-symbol sequence. This version does pruning as part of the\n" + "determinization algorithm, which is more efficient and prevents blowup.\n" + "It retains the per-frame acoustic scores and writes the output as a regular\n" + "(non-compact) lattice.\n" + "See http://kaldi-asr.org/doc/lattices.html for more information on lattices.\n" + "\n" + "Usage: lattice-determinize-pruned-non-compact [options] lattice-rspecifier lattice-wspecifier\n" + " e.g.: lattice-determinize-pruned-non-compact --acoustic-scale=0.1 --beam=6.0 ark:in.lats ark:det.lats\n"; + + ParseOptions po(usage); + BaseFloat acoustic_scale = 1.0; + BaseFloat beam = 10.0; + bool minimize = false; + fst::DeterminizeLatticePrunedOptions opts; // Options used in DeterminizeLatticePruned-- + // this options class does not have its own Register function as it's viewed as + // being more part of "fst world", so we register its elements independently.
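+ // max_mem is in bytes, i.e. roughly a 50 MB limit here.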
+ opts.max_mem = 50000000; + opts.max_loop = 0; // was 500000; + + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); + po.Register("minimize", &minimize, + "If true, push and minimize after determinization"); + opts.Register(&po); + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string lats_rspecifier = po.GetArg(1), + lats_wspecifier = po.GetArg(2); + + + // Read as regular lattice-- this is the form the determinization code + // accepts. + SequentialLatticeReader lat_reader(lats_rspecifier); + + // Write as a regular (non-compact) lattice. + LatticeWriter lat_writer(lats_wspecifier); + + int32 n_done = 0, n_warn = 0; + + // depth stats (for diagnostics). + double sum_depth_in = 0.0, + sum_depth_out = 0.0, sum_t = 0.0; + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + + for (; !lat_reader.Done(); lat_reader.Next()) { + std::string key = lat_reader.Key(); + Lattice lat = lat_reader.Value(); + + KALDI_VLOG(2) << "Processing lattice " << key; + + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + + Invert(&lat); // so word labels are on the input side. + lat_reader.FreeCurrent(); + if (!TopSort(&lat)) { + KALDI_WARN << "Could not topologically sort lattice: this probably means it" + " has bad properties e.g. epsilon cycles. Your LM or lexicon might " + "be broken, e.g. LM with epsilon cycles or lexicon with empty words."; + } + fst::ArcSort(&lat, fst::ILabelCompare()); + CompactLattice det_clat; + if (!DeterminizeLatticePruned(lat, beam, &det_clat, opts)) { + KALDI_WARN << "For key " << key << ", determinization did not succeed " + "(partial output will be pruned tighter than the specified beam.)"; + n_warn++; + } + fst::Connect(&det_clat); + if (det_clat.NumStates() == 0) { + KALDI_WARN << "For key " << key << ", determinized and trimmed lattice " + "was empty."; + n_warn++; + } + if (minimize) { + PushCompactLatticeStrings(&det_clat); + PushCompactLatticeWeights(&det_clat); + MinimizeCompactLattice(&det_clat); + } + + int32 t; + TopSortCompactLatticeIfNeeded(&det_clat); + double depth = CompactLatticeDepth(det_clat, &t); + sum_depth_in += lat.NumStates(); + sum_depth_out += depth * t; + sum_t += t; + + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &out_lat); + lat_writer.Write(key, out_lat); + n_done++; + } + + if (sum_t != 0.0) { + KALDI_LOG << "Average input-lattice depth (measured at state level) is " + << (sum_depth_in / sum_t) << ", output depth is " + << (sum_depth_out / sum_t) << ", over " << sum_t << " frames " + << " (average num-frames = " << (sum_t / n_done) << ")."; + } + KALDI_LOG << "Done " << n_done << " lattices, determinization finished " + << "earlier than specified by the beam (or output was empty) on " + << n_warn << " of these."; + return (n_done != 0 ?
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/latbin/lattice-expand-ngram.cc b/src/latbin/lattice-expand-ngram.cc index 1b8cfbee24b..6c49fab9daa 100644 --- a/src/latbin/lattice-expand-ngram.cc +++ b/src/latbin/lattice-expand-ngram.cc @@ -20,6 +20,7 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -38,9 +39,11 @@ int main(int argc, char *argv[]) { "e.g.: lattice-expand-ngram --n=3 ark:lat ark:expanded_lat\n"; ParseOptions po(usage); + bool write_compact = true; int32 n = 3; std::string word_syms_filename; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("n", &n, "n-gram context to expand to."); po.Read(argc, argv); @@ -56,33 +59,74 @@ int main(int argc, char *argv[]) { lats_wspecifier = po.GetOptArg(2); fst::UnweightedNgramFst expand_fst(n); + + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; - SequentialCompactLatticeReader lat_reader(lats_rspecifier); - CompactLatticeWriter lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) { + compact_lattice_reader.Open(lats_rspecifier); + compact_lattice_writer.Open(lats_wspecifier); + } else { + lattice_reader.Open(lats_rspecifier); + lattice_writer.Open(lats_wspecifier); + } int32 n_done = 0, n_fail = 0; - for (; !lat_reader.Done(); lat_reader.Next()) { - std::string key = lat_reader.Key(); + for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done(); + write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) { + std::string key; + CompactLattice clat; + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + KALDI_LOG << "Processing lattice for key " << key; - CompactLattice lat = lat_reader.Value(); + if (write_compact) { + key = compact_lattice_reader.Key(); + clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + } else { + key = lattice_reader.Key(); + const Lattice &lat = lattice_reader.Value(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + ComputeAcousticScoresMap(lat, &acoustic_scores); + + ConvertLattice(lat, &clat); + + lattice_reader.FreeCurrent(); + } CompactLattice expanded_lat; - ComposeDeterministicOnDemand(lat, &expand_fst, &expanded_lat); + ComposeDeterministicOnDemand(clat, &expand_fst, &expanded_lat); if (expanded_lat.Start() == fst::kNoStateId) { KALDI_WARN << "Empty lattice for utterance " << key << std::endl; n_fail++; } else { - if (lat.NumStates() == expanded_lat.NumStates()) { + if (clat.NumStates() == expanded_lat.NumStates()) { KALDI_LOG << "Lattice for key " << key << " did not need to be expanded for order " << n << "."; } else { - KALDI_LOG << "Lattice expanded from " << lat.NumStates() << " to " + KALDI_LOG << "Lattice expanded from " << clat.NumStates() << " to " << expanded_lat.NumStates() << " states for order " << n << "."; } - lat_writer.Write(key, expanded_lat); + if (write_compact) { + compact_lattice_writer.Write(key, expanded_lat); + } else { + Lattice out_lat; + fst::ConvertLattice(expanded_lat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + 
lattice_writer.Write(key, out_lat); + } n_done++; } - lat_reader.FreeCurrent(); } KALDI_LOG << "Processed " << n_done << " lattices with " << n_fail << " failures."; diff --git a/src/latbin/lattice-interp.cc b/src/latbin/lattice-interp.cc index 41e1b32658f..dcd851e5b73 100644 --- a/src/latbin/lattice-interp.cc +++ b/src/latbin/lattice-interp.cc @@ -22,6 +22,7 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -45,9 +46,15 @@ int main(int argc, char *argv[]) { " e.g.: lattice-compose ark:1.lats ark:2.lats ark:composed.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat alpha = 0.5; // Scale of 1st in the pair. + BaseFloat alpha_acoustic = kLogZeroBaseFloat; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("alpha", &alpha, "Scale of the first lattice in the pair (should be in range [0, 1])"); + po.Register("alpha-acoustic", &alpha_acoustic, + "If specified, then alpha will be used for graph scores and " + "alpha_acoustic will be used for acoustic scores (should be in range [0, 1])"); po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -55,6 +62,12 @@ int main(int argc, char *argv[]) { exit(1); } + if (alpha_acoustic == kLogZeroBaseFloat) { + alpha_acoustic = alpha; + } + + KALDI_ASSERT(alpha_acoustic <= 1.0 && alpha_acoustic >= 0.0); + std::string lats_rspecifier1 = po.GetArg(1), lats_rspecifier2 = po.GetArg(2), lats_wspecifier = po.GetArg(3); @@ -62,7 +75,13 @@ int main(int argc, char *argv[]) { SequentialLatticeReader lattice_reader1(lats_rspecifier1); RandomAccessCompactLatticeReader lattice_reader2(lats_rspecifier2); - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); int32 n_processed = 0, n_empty = 0, n_success = 0, n_no_2ndlat=0; @@ -70,9 +89,16 @@ int main(int argc, char *argv[]) { std::string key = lattice_reader1.Key(); Lattice lat1 = lattice_reader1.Value(); lattice_reader1.FreeCurrent(); - ScaleLattice(fst::LatticeScale(alpha, alpha), &lat1); + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat1, &acoustic_scores); + ScaleLattice(fst::LatticeScale(alpha, alpha_acoustic), &lat1); + ArcSort(&lat1, fst::OLabelCompare()); + if (lattice_reader2.HasKey(key)) { n_processed++; CompactLattice clat2 = lattice_reader2.Value(key); @@ -81,7 +107,7 @@ int main(int argc, char *argv[]) { Lattice lat2; ConvertLattice(clat2, &lat2); fst::Project(&lat2, fst::PROJECT_OUTPUT); // project on words. 
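+ // lat1 was scaled above by (alpha, alpha_acoustic) for its (graph, acoustic) costs; + // lat2 gets the complementary weights (1.0-alpha, 1.0-alpha_acoustic) below, so + // composition interpolates the two lattices' scores.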
- ScaleLattice(fst::LatticeScale(1.0-alpha, 1.0-alpha), &lat2); + ScaleLattice(fst::LatticeScale(1.0-alpha, 1.0-alpha_acoustic), &lat2); ArcSort(&lat2, fst::ILabelCompare()); Lattice lat3; @@ -91,9 +117,16 @@ int main(int argc, char *argv[]) { n_empty++; } else { n_success++; - CompactLattice clat3; - ConvertLattice(lat3, &clat3); - compact_lattice_writer.Write(key, clat3); + if (write_compact) { + CompactLattice clat3; + ConvertLattice(lat3, &clat3); + compact_lattice_writer.Write(key, clat3); + } else { + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &lat3); + lattice_writer.Write(key, lat3); + } } } else { KALDI_WARN << "No lattice found for utterance " << key << " in " diff --git a/src/latbin/lattice-lmrescore-const-arpa.cc b/src/latbin/lattice-lmrescore-const-arpa.cc index 789f0fb8d4e..4613e805b8a 100644 --- a/src/latbin/lattice-lmrescore-const-arpa.cc +++ b/src/latbin/lattice-lmrescore-const-arpa.cc @@ -44,8 +44,10 @@ int main(int argc, char *argv[]) { " const_arpa ark:out.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat lm_scale = 1.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("lm-scale", &lm_scale, "Scaling factor for language model " "costs; frequently 1.0 or -1.0"); @@ -65,14 +67,45 @@ int main(int argc, char *argv[]) { ReadKaldiObject(lm_rxfilename, &const_arpa); // Reads and writes as compact lattice. - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + SequentialCompactLatticeReader compact_lattice_reader; + CompactLatticeWriter compact_lattice_writer; + + SequentialLatticeReader lattice_reader; + LatticeWriter lattice_writer; + + if (write_compact) { + compact_lattice_reader.Open(lats_rspecifier); + compact_lattice_writer.Open(lats_wspecifier); + } else { + lattice_reader.Open(lats_rspecifier); + lattice_writer.Open(lats_wspecifier); + } int32 n_done = 0, n_fail = 0; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - std::string key = compact_lattice_reader.Key(); - CompactLattice clat = compact_lattice_reader.Value(); - compact_lattice_reader.FreeCurrent(); + for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done(); + write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) { + std::string key = write_compact ? 
compact_lattice_reader.Key() : lattice_reader.Key(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + + CompactLattice clat; + if (write_compact) { + clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + } else { + const Lattice &lat = lattice_reader.Value(); + + if (lm_scale == 0.0) { + lattice_writer.Write(key, lat); + continue; + } + + ComputeAcousticScoresMap(lat, &acoustic_scores); + fst::ConvertLattice(lat, &clat); + lattice_reader.FreeCurrent(); + } if (lm_scale != 0.0) { // Before composing with the LM FST, we scale the lattice weights @@ -104,13 +137,34 @@ int main(int argc, char *argv[]) { << " (incompatible LM?)"; n_fail++; } else { - compact_lattice_writer.Write(key, determinized_clat); + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(determinized_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } } else { // Zero scale so nothing to do. n_done++; - compact_lattice_writer.Write(key, clat); + + if (write_compact) { + compact_lattice_writer.Write(key, clat); + } else { + Lattice out_lat; + fst::ConvertLattice(clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } } } diff --git a/src/latbin/lattice-lmrescore.cc b/src/latbin/lattice-lmrescore.cc index 2e5406f75de..d9367a55480 100644 --- a/src/latbin/lattice-lmrescore.cc +++ b/src/latbin/lattice-lmrescore.cc @@ -1,4 +1,4 @@ -// latbin/lattice-lmrescore.cc +//latbin/lattice-lmrescore.cc // Copyright 2009-2011 Microsoft Corporation // 2014 Johns Hopkins University (author: Daniel Povey) @@ -24,6 +24,7 @@ #include "fstext/fstext-lib.h" #include "fstext/kaldi-fst-io.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -43,9 +44,11 @@ int main(int argc, char *argv[]) { " e.g.: lattice-lmrescore --lm-scale=-1.0 ark:in.lats 'fstproject --project_output=true data/lang/G.fst|' ark:out.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat lm_scale = 1.0; int32 num_states_cache = 50000; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("lm-scale", &lm_scale, "Scaling factor for language model costs; frequently 1.0 or -1.0"); po.Register("num-states-cache", &num_states_cache, "Number of states we cache when mapping LM FST to lattice type. " @@ -99,8 +102,13 @@ int main(int argc, char *argv[]) { // composition and determinization. SequentialLatticeReader lattice_reader(lats_rspecifier); - // Write as compact lattice. 
- CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); int32 n_done = 0, n_fail = 0; @@ -108,6 +116,13 @@ int main(int argc, char *argv[]) { std::string key = lattice_reader.Key(); Lattice lat = lattice_reader.Value(); lattice_reader.FreeCurrent(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); + if (lm_scale != 0.0) { // Only need to modify it if LM scale nonzero. // Before composing with the LM FST, we scale the lattice weights @@ -126,22 +141,38 @@ int main(int argc, char *argv[]) { TableCompose(lat, lm_fst, &composed_lat, &lm_compose_cache); Invert(&composed_lat); // make it so word labels are on the input. - CompactLattice determinized_lat; - DeterminizeLattice(composed_lat, &determinized_lat); - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_lat); - if (determinized_lat.Start() == fst::kNoStateId) { - KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; + + CompactLattice determinized_clat; + DeterminizeLattice(composed_lat, &determinized_clat); + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); + if (determinized_clat.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty lattice for utterance " << key + << " (incompatible LM?)"; n_fail++; } else { - compact_lattice_writer.Write(key, determinized_lat); + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); + } else { + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + Lattice out_lat; + ConvertLattice(determinized_clat, &out_lat); + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } } else { - // zero scale so nothing to do. + // Zero scale so nothing to do. 
n_done++; - CompactLattice compact_lat; - ConvertLattice(lat, &compact_lat); - compact_lattice_writer.Write(key, compact_lat); + + if (write_compact) { + CompactLattice compact_lat; + ConvertLattice(lat, &compact_lat); + compact_lattice_writer.Write(key, compact_lat); + } else { + lattice_writer.Write(key, lat); + } } } diff --git a/src/latbin/lattice-prune.cc b/src/latbin/lattice-prune.cc index 49399f748e4..993eea41145 100644 --- a/src/latbin/lattice-prune.cc +++ b/src/latbin/lattice-prune.cc @@ -40,10 +40,12 @@ int main(int argc, char *argv[]) { " e.g.: lattice-prune --acoustic-scale=0.1 --beam=4.0 ark:1.lats ark:pruned.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat inv_acoustic_scale = 1.0; BaseFloat beam = 10.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("inv-acoustic-scale", &inv_acoustic_scale, "An alternative way of setting the " "acoustic scale: you can set its inverse."); @@ -63,10 +65,18 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), lats_wspecifier = po.GetArg(2); - + SequentialCompactLatticeReader compact_lattice_reader; + CompactLatticeWriter compact_lattice_writer; + SequentialLatticeReader lattice_reader; + LatticeWriter lattice_writer; - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + if (write_compact) { + compact_lattice_reader.Open(lats_rspecifier); + compact_lattice_writer.Open(lats_wspecifier); + } else { + lattice_reader.Open(lats_rspecifier); + lattice_writer.Open(lats_wspecifier); + } int32 n_done = 0, n_err = 0; int64 n_arcs_in = 0, n_arcs_out = 0, @@ -75,10 +85,25 @@ int main(int argc, char *argv[]) { if (acoustic_scale == 0.0) KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - std::string key = compact_lattice_reader.Key(); - CompactLattice clat = compact_lattice_reader.Value(); - compact_lattice_reader.FreeCurrent(); + for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done(); + write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) { + std::string key = write_compact ? 
compact_lattice_reader.Key() : lattice_reader.Key(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + + CompactLattice clat; + if (write_compact) { + clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + } else { + const Lattice &lat = lattice_reader.Value(); + ComputeAcousticScoresMap(lat, &acoustic_scores); + + fst::ConvertLattice(lat, &clat); + lattice_reader.FreeCurrent(); + } fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &clat); int64 narcs = NumArcs(clat), nstates = clat.NumStates(); n_arcs_in += narcs; @@ -96,7 +121,18 @@ int main(int argc, char *argv[]) { << nstates << " to " << pruned_nstates << " and #arcs from " << narcs << " to " << pruned_narcs; fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &pruned_clat); - compact_lattice_writer.Write(key, pruned_clat); + + if (write_compact) { + compact_lattice_writer.Write(key, pruned_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(pruned_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } diff --git a/src/latbin/lattice-to-fst.cc b/src/latbin/lattice-to-fst.cc index 0d2ac29a99b..19f8bf453c1 100644 --- a/src/latbin/lattice-to-fst.cc +++ b/src/latbin/lattice-to-fst.cc @@ -22,6 +22,50 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "hmm/transition-model.h" + +namespace kaldi { + +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. 
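+ // In the output FST each arc carries pdf-id + 1 on its input side (label 0 stays reserved for epsilon), the word label on its output side, and the graph and acoustic costs summed into a single tropical weight.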
+ StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) { + StateId news = ofst->AddState(); + assert(news == s); + } + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + ArcIn arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + oarc.olabel = arc.olabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +} + int main(int argc, char *argv[]) { try { @@ -34,20 +78,33 @@ int main(int argc, char *argv[]) { using std::vector; BaseFloat acoustic_scale = 0.0; BaseFloat lm_scale = 0.0; - bool rm_eps = true; - + bool rm_eps = true, read_compact = true, convert_to_pdf_labels = false; + std::string trans_model; + bool project_input = false, project_output = true; + const char *usage = "Turn lattices into normal FSTs, retaining only the word labels\n" "By default, removes all weights and also epsilons (configure with\n" "with --acoustic-scale, --lm-scale and --rm-eps)\n" "Usage: lattice-to-fst [options] lattice-rspecifier fsts-wspecifier\n" " e.g.: lattice-to-fst ark:1.lats ark:1.fsts\n"; - + ParseOptions po(usage); + po.Register("read-compact", &read_compact, "Read compact lattice"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("lm-scale", &lm_scale, "Scaling factor for graph/lm costs"); po.Register("rm-eps", &rm_eps, "Remove epsilons in resulting FSTs (in lazy way; may not remove all)"); - + po.Register("convert-to-pdf-labels", &convert_to_pdf_labels, + "Convert lattice to pdf labels"); + po.Register("trans-model", &trans_model, + "Transition model"); + po.Register("project-input", &project_input, + "Project to input labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Register("project-output", &project_output, + "Project to output labels (words); applicable only " + "when --read-compact=false"); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -56,35 +113,74 @@ int main(int argc, char *argv[]) { } vector > scale = fst::LatticeScale(lm_scale, acoustic_scale); - + std::string lats_rspecifier = po.GetArg(1), fsts_wspecifier = po.GetArg(2); - - SequentialCompactLatticeReader lattice_reader(lats_rspecifier); + + TransitionModel tmodel; + if (!trans_model.empty()) { + ReadKaldiObject(trans_model, &tmodel); + } + + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; + TableWriter fst_writer(fsts_wspecifier); - + int32 n_done = 0; // there is no failure mode, barring a crash. - for (; !lattice_reader.Done(); lattice_reader.Next()) { - std::string key = lattice_reader.Key(); - CompactLattice clat = lattice_reader.Value(); - lattice_reader.FreeCurrent(); - ScaleLattice(scale, &clat); // typically scales to zero. - RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... - fst::VectorFst fst; - { - Lattice lat; - ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce - // extra states because already removed alignments.
- ConvertLattice(lat, &fst); // this adds up the (lm,acoustic) costs to get - // the normal (tropical) costs. - Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard Lattice format, - // the words are on the output, and we want the word labels. + if (read_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + std::string key = compact_lattice_reader.Key(); + CompactLattice clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + ScaleLattice(scale, &clat); // typically scales to zero. + RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... + fst::VectorFst fst; + { + Lattice lat; + ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce + // extra states because already removed alignments. + + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); // this adds up the (lm,acoustic) costs to get + // the normal (tropical) costs. + } else { + ConvertLattice(lat, &fst); + } + + Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard compact_lattice format, + // the words are on the output, and we want the word labels. + } + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; + } + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + lattice_reader.FreeCurrent(); + ScaleLattice(scale, &lat); // typically scales to zero. + fst::VectorFst fst; + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); + } else { + ConvertLattice(lat, &fst); + } + if (project_input) + Project(&fst, fst::PROJECT_INPUT); + else if (project_output) + Project(&fst, fst::PROJECT_OUTPUT); + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; } - if (rm_eps) RemoveEpsLocal(&fst); - - fst_writer.Write(key, fst); - n_done++; + } KALDI_LOG << "Done converting " << n_done << " lattices to word-level FSTs"; return (n_done != 0 ? 0 : 1); diff --git a/src/latbin/lattice-to-phone-lattice.cc b/src/latbin/lattice-to-phone-lattice.cc index 10da2b47bf1..749435d3bf6 100644 --- a/src/latbin/lattice-to-phone-lattice.cc +++ b/src/latbin/lattice-to-phone-lattice.cc @@ -49,6 +49,8 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); bool replace_words = true; + bool write_compact = true; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("replace-words", &replace_words, "If true, replace words with phones; otherwise replace " "transition-ids with phones."); @@ -70,26 +72,37 @@ int main(int argc, char *argv[]) { ReadKaldiObject(model_rxfilename, &trans_model); - SequentialCompactLatticeReader clat_reader(lats_rspecifier); - CompactLatticeWriter clat_writer(lats_wspecifier); // write as compact. - for (; !clat_reader.Done(); clat_reader.Next()) { - if (replace_words) { - Lattice lat; - ConvertLattice(clat_reader.Value(), &lat); - ConvertLatticeToPhones(trans_model, &lat); // this function replaces words -> phones - CompactLattice clat; - ConvertLattice(lat, &clat); - clat_writer.Write(clat_reader.Key(), clat); - } else { // replace transition-ids with phones. - CompactLattice clat(clat_reader.Value()); - ConvertCompactLatticeToPhones(trans_model, &clat); - // this function replaces transition-ids with phones. 
We do it in the - // CompactLattice form, in order to preserve the alignment of - // transition-id sequences/phones-sequences to words [e.g. if you just - // did lattice-align-words]. - clat_writer.Write(clat_reader.Key(), clat); + if (write_compact) { + SequentialCompactLatticeReader clat_reader(lats_rspecifier); + CompactLatticeWriter clat_writer(lats_wspecifier); + for (; !clat_reader.Done(); clat_reader.Next()) { + if (replace_words) { + Lattice lat; + ConvertLattice(clat_reader.Value(), &lat); + ConvertLatticeToPhones(trans_model, &lat); // this function replaces words -> phones + CompactLattice clat; + ConvertLattice(lat, &clat); + clat_writer.Write(clat_reader.Key(), clat); + } else { // replace transition-ids with phones. + CompactLattice clat(clat_reader.Value()); + ConvertCompactLatticeToPhones(trans_model, &clat); + // this function replaces transition-ids with phones. We do it in the + // CompactLattice form, in order to preserve the alignment of + // transition-id sequences/phones-sequences to words [e.g. if you just + // did lattice-align-words]. + clat_writer.Write(clat_reader.Key(), clat); + } + n_done++; + } + } else { + SequentialLatticeReader lat_reader(lats_rspecifier); + LatticeWriter lat_writer(lats_wspecifier); + for (; !lat_reader.Done(); lat_reader.Next()) { + Lattice lat(lat_reader.Value()); + ConvertLatticeToPhones(trans_model, &lat, replace_words); // replaces words -> phones, or transition-ids -> phones if --replace-words=false + lat_writer.Write(lat_reader.Key(), lat); + n_done++; } - n_done++; } KALDI_LOG << "Done converting " << n_done << " lattices."; return (n_done != 0 ? 0 : 1); diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index fcfe0616b64..fe2610ad44d 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -1417,6 +1417,23 @@ void MatrixBase::Write(std::ostream &os, bool binary) const { } } +template +std::string MatrixBase::ToStr() const { + std::ostringstream oss; + if (num_cols_ == 0) { + oss << " [ ]\n"; + } else { + oss << " ["; + for (MatrixIndexT i = 0; i < num_rows_; i++) { + oss << "\n "; + for (MatrixIndexT j = 0; j < num_cols_; j++) + oss << (*this)(i, j) << " "; + } + oss << "]\n"; + } + return oss.str(); +} + template void MatrixBase::Read(std::istream & is, bool binary, bool add) { diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index a973824128c..639ffa75194 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -718,6 +718,8 @@ class MatrixBase { /// write to stream. void Write(std::ostream & out, bool binary) const; + std::string ToStr() const; + // Below is internal methods for Svd, user does not have to know about this.
#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD) // protected: diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 55d8edeb4b3..d1f5de54cd3 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -654,26 +654,49 @@ void SparseMatrix::Resize(MatrixIndexT num_rows, template void SparseMatrix::AppendSparseMatrixRows( - std::vector > *inputs) { + std::vector > *inputs, + bool sort_by_t) { rows_.clear(); size_t num_rows = 0; typename std::vector >::iterator input_iter = inputs->begin(), input_end = inputs->end(); - for (; input_iter != input_end; ++input_iter) + int32 local_row_size = input_iter->rows_.size(), + num_inputs = inputs->size(); + for (; input_iter != input_end; ++input_iter) { num_rows += input_iter->rows_.size(); + if (sort_by_t) + if (input_iter->rows_.size() != local_row_size) + KALDI_ERR << "we can not append sparse matrices with inconsistent " + << " number of rows, if sort_by_t is true"; + } rows_.resize(num_rows); typename std::vector >::iterator row_iter = rows_.begin(), row_end = rows_.end(); - for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) { - typename std::vector >::iterator - input_row_iter = input_iter->rows_.begin(), - input_row_end = input_iter->rows_.end(); - for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter) - row_iter->Swap(&(*input_row_iter)); + if (sort_by_t) { + // If true, the matrices appended to be sorted first by original row index (t) and next by matrix order in input. + // i.e. all rows with same index in local input matrix are appended in a same block. + int32 n = 0, t = 0; // 'n' is index over matrices and 't' is index for rows in matrixes. + for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter, ++n) { + typename std::vector >::iterator + input_row_iter = input_iter->rows_.begin(), + input_row_end = input_iter->rows_.end(); + for (t = 0; input_row_iter != input_row_end; ++input_row_iter, ++t) { + int32 src_row_index = n + t * num_inputs; + rows_[src_row_index].Swap(&(*input_row_iter)); + } + } + } else { + for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) { + typename std::vector >::iterator + input_row_iter = input_iter->rows_.begin(), + input_row_end = input_iter->rows_.end(); + for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter) + row_iter->Swap(&(*input_row_iter)); + } + KALDI_ASSERT(row_iter == row_end); } - KALDI_ASSERT(row_iter == row_end); int32 num_cols = NumCols(); for (row_iter = rows_.begin(); row_iter != row_end; ++row_iter) { if (row_iter->Dim() != num_cols) @@ -926,7 +949,8 @@ void GeneralMatrix::Read(std::istream &is, bool binary) { void AppendGeneralMatrixRows(const std::vector &src, - GeneralMatrix *mat) { + GeneralMatrix *mat, + bool sort_by_t) { mat->Clear(); int32 size = src.size(); if (size == 0) @@ -943,7 +967,7 @@ void AppendGeneralMatrixRows(const std::vector &src, for (int32 i = 0; i < size; i++) sparse_mats[i] = src[i]->GetSparseMatrix(); SparseMatrix appended_mat; - appended_mat.AppendSparseMatrixRows(&sparse_mats); + appended_mat.AppendSparseMatrixRows(&sparse_mats, sort_by_t); mat->SwapSparseMatrix(&appended_mat); } else { int32 tot_rows = 0, num_cols = -1; @@ -958,17 +982,43 @@ void AppendGeneralMatrixRows(const std::vector &src, << num_cols << " vs. 
" << src_cols; } } - Matrix appended_mat(tot_rows, num_cols, kUndefined); + Matrix appended_mat(tot_rows, num_cols); + Matrix appended_mat_check(tot_rows, num_cols, kUndefined); + int32 row_offset = 0; - for (int32 i = 0; i < size; i++) { - const GeneralMatrix &src_mat = *(src[i]); - int32 src_rows = src_mat.NumRows(); - if (src_rows != 0) { - SubMatrix dest_submat(appended_mat, row_offset, src_rows, - 0, num_cols); - src_mat.CopyToMat(&dest_submat); + if (sort_by_t) { + // reorder the src mat rows to be inserted in appended matrix, in order to + // have sorted matrix first by 't' and next by 'n'. + int32 local_row_size = src[0]->NumRows(); + for (int32 i = 0; i < size; i++) { + const GeneralMatrix &src_mat = *(src[i]); + Matrix full_src_mat(src_mat.NumRows(), src_mat.NumCols()); + src_mat.CopyToMat(&full_src_mat); + int32 src_rows = src_mat.NumRows(); + if (src_rows != local_row_size) + KALDI_ERR << "Appending rows of matrices with inconsistent num-rows " + << "with sort-by-t=true is not possible:"; + std::vector reorder_indexes(local_row_size); + for (int32 j = 0; j < local_row_size; j++) { + reorder_indexes[j] = j * size + i; + appended_mat_check.Row(j * size + i).CopyFromVec(full_src_mat.Row(j)); + } + full_src_mat.AddToRows(1.0, &(reorder_indexes[0]), &appended_mat); row_offset += src_rows; } + + KALDI_ASSERT(appended_mat.ApproxEqual(appended_mat_check)); + } else { + for (int32 i = 0; i < size; i++) { + const GeneralMatrix &src_mat = *(src[i]); + int32 src_rows = src_mat.NumRows(); + if (src_rows != 0) { + SubMatrix dest_submat(appended_mat, row_offset, src_rows, + 0, num_cols); + src_mat.CopyToMat(&dest_submat); + row_offset += src_rows; + } + } } KALDI_ASSERT(row_offset == tot_rows); mat->SwapFullMatrix(&appended_mat); diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h index 76f77f531d5..35efebdabd7 100644 --- a/src/matrix/sparse-matrix.h +++ b/src/matrix/sparse-matrix.h @@ -210,7 +210,13 @@ class SparseMatrix { /// function is destructive of the inputs. Requires, obviously, /// that the inputs all have the same dimension (although some may be /// empty). - void AppendSparseMatrixRows(std::vector > *inputs); + /// + /// If sort_by_t is true, all sparse matrixes are appended in a way to be sorted + /// w.r.t their local row indexes and then sorted with matrix index. + /// i.e. all rows of matrixes with same index are in same block. + /// Also number of rows in all matrixes needs to be equal. + void AppendSparseMatrixRows(std::vector > *inputs, + bool sort_by_t=false); SparseMatrix() { } @@ -392,7 +398,8 @@ class GeneralMatrix { /// Does not preserve compression, if inputs were compressed; you have to /// re-compress manually, if that's what you need. 
void AppendGeneralMatrixRows(const std::vector &src, - GeneralMatrix *mat); + GeneralMatrix *mat, + bool sort_by_t = false); /// Outputs a SparseMatrix containing only the rows r of "in" such that diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 81f19c44b5c..0d8f5d5b961 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -23,6 +23,46 @@ namespace kaldi { namespace nnet3 { +void NnetChainComputeProb::ParseObjectiveOpts( + const chain::ChainTrainingOptions &chain_config) { + if (!chain_config.silence_pdfs_str.empty()) { + std::vector silence_pdfs; + SplitStringToVector(chain_config.silence_pdfs_str, ":,", false, + &silence_pdfs); + + int32 num_pdfs = nnet_.OutputDim("output"); + std::vector indices(num_pdfs); + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } + + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << chain_config.silence_pdfs_str; + indices[pdf] = -1; + } + + sil_indices_.Resize(num_pdfs); + sil_indices_.CopyFromVec(indices); + } + + if (!chain_config.smbr_factors_str.empty()) + ParseObjectiveScales(chain_config.smbr_factors_str, + &smbr_factors_); + if (!chain_config.mmi_factors_str.empty()) + ParseObjectiveScales(chain_config.mmi_factors_str, + &mmi_factors_); + if (!chain_config.ml_factors_str.empty()) + ParseObjectiveScales(chain_config.ml_factors_str, + &ml_factors_); + if (!chain_config.kl_factors_str.empty()) + ParseObjectiveScales(chain_config.kl_factors_str, + &kl_factors_); +} + NnetChainComputeProb::NnetChainComputeProb( const NnetComputeProbOptions &nnet_config, const chain::ChainTrainingOptions &chain_config, @@ -44,8 +84,9 @@ NnetChainComputeProb::NnetChainComputeProb( KALDI_ERR << "If you set store_component_stats == true and " << "compute_deriv == false, use the other constructor."; } -} + ParseObjectiveOpts(chain_config); +} NnetChainComputeProb::NnetChainComputeProb( const NnetComputeProbOptions &nnet_config, @@ -62,8 +103,9 @@ NnetChainComputeProb::NnetChainComputeProb( num_minibatches_processed_(0) { KALDI_ASSERT(den_graph_.NumPdfs() > 0); KALDI_ASSERT(nnet_config.store_component_stats && !nnet_config.compute_deriv); -} + ParseObjectiveOpts(chain_config); +} const Nnet &NnetChainComputeProb::GetDeriv() const { if (!nnet_config_.compute_deriv) @@ -126,7 +168,36 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, KALDI_ERR << "Network has no output named " << sup.name; const CuMatrixBase &nnet_output = computer->GetOutput(sup.name); - bool use_xent = (chain_config_.xent_regularize != 0.0); + + chain::ChainTrainingOptions chain_config_copy(chain_config_); + + { + auto it = smbr_factors_.find(sup.name); + if (it != smbr_factors_.end()) + chain_config_copy.smbr_factor = it->second; + + if (chain_config_copy.smbr_factor > 0.0 && !chain_config_copy.use_smbr_objective) + KALDI_ERR << "smbr factor for " << sup.name << " = " + << chain_config_copy.smbr_factor + << " > 0.0, but --use-smbr-objective=false"; + } + { + auto it = mmi_factors_.find(sup.name); + if (it != mmi_factors_.end()) + chain_config_copy.mmi_factor = it->second; + } + { + auto it = ml_factors_.find(sup.name); + if (it != ml_factors_.end()) + chain_config_copy.ml_factor = it->second; + } + { + auto it = kl_factors_.find(sup.name); + if (it != kl_factors_.end()) + chain_config_copy.kl_factor = it->second; + } + + bool 
use_xent = (chain_config_copy.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix nnet_output_deriv, xent_deriv; if (nnet_config_.compute_deriv) @@ -136,13 +207,41 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - BaseFloat tot_like, tot_l2_term, tot_weight; + BaseFloat tot_like, tot_mmi_objf, tot_l2_term, tot_weight; + + if (chain_config_copy.kl_factor > 0.0) { + KALDI_ASSERT(chain_config_copy.smbr_factor == 0.0); + if (!chain_config_copy.self_kl) + KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0); + } + + if (chain_config_copy.smbr_factor > 0.0) { + ComputeChainSmbrObjfAndDeriv( + chain_config_copy, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_mmi_objf, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? &sil_indices_ : NULL); + } else { + ComputeChainObjfAndDeriv(chain_config_copy, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL)); - ComputeChainObjfAndDeriv(chain_config_, den_graph_, - sup.supervision, nnet_output, - &tot_like, &tot_l2_term, &tot_weight, - (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? &xent_deriv : NULL)); + if (chain_config_copy.self_kl) { + const CuMatrixBase &teacher_nnet_output = + computer->GetOutput(sup.name + "-teacher"); + + BaseFloat num_objf = 0, num_weight = 0.0; + ComputeChainDenominatorObjfAndDeriv(chain_config_copy, den_graph_, teacher_nnet_output, + sup.supervision.weight, sup.supervision.num_sequences, + &num_objf, &num_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); + } + } // note: in this context we don't want to apply 'sup.deriv_weights' because // this code is used only in combination, where it's part of an L-BFGS @@ -152,10 +251,32 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, // and conjugate gradient descent both rely on the derivatives being // accurate, and don't fail gracefully if the derivatives are not accurate). 
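    // [Illustrative note, not part of the original patch] The per-output factor
    // maps consulted above (smbr_factors_, mmi_factors_, ml_factors_, kl_factors_)
    // are filled by ParseObjectiveOpts() from comma-separated "output-name:value"
    // strings; the names and values below are hypothetical:
    //
    //   // e.g. chain_config_.mmi_factors_str == "output:1.0,output-1:0.5"
    //   // => mmi_factors_ == {{"output", 1.0}, {"output-1", 0.5}}, so for
    //   //    sup.name == "output-1", chain_config_copy.mmi_factor becomes 0.5
    //   //    while other outputs keep the default from chain_config_.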
- ChainObjectiveInfo &totals = objf_info_[sup.name]; - totals.tot_weight += tot_weight; - totals.tot_like += tot_like; - totals.tot_l2_term += tot_l2_term; + std::vector aux_objfs; + aux_objfs.push_back(tot_l2_term); + if (chain_config_copy.smbr_factor > 0.0) + aux_objfs.push_back(tot_mmi_objf); + + { + unordered_map::iterator it + = objf_info_.find(sup.name); + + if (it == objf_info_.end()) { + BaseFloat this_objf_scale = 1.0; + std::vector aux_objf_scales(1, 1.0); // l2_term + if (chain_config_copy.smbr_factor > 0.0) { + this_objf_scale *= chain_config_copy.smbr_factor; + aux_objf_scales.push_back( + (chain_config_copy.mmi_factor + chain_config_copy.ml_factor)); + } + + ChainObjectiveInfo totals(this_objf_scale, aux_objf_scales); + it = objf_info_.insert(it, std::make_pair(sup.name, totals)); + } + + it->second.tot_weight += tot_weight; + it->second.tot_like += tot_like; + it->second.tot_aux_objfs.Add(aux_objfs); + } if (nnet_config_.compute_deriv) computer->AcceptInput(sup.name, &nnet_output_deriv); @@ -187,10 +308,18 @@ bool NnetChainComputeProb::PrintTotalStats() const { int32 node_index = nnet_.GetNodeIndex(name); KALDI_ASSERT(node_index >= 0); const ChainObjectiveInfo &info = iter->second; - BaseFloat like = (info.tot_like / info.tot_weight), - l2_term = (info.tot_l2_term / info.tot_weight), - tot_objf = like + l2_term; - if (info.tot_l2_term == 0.0) { + BaseFloat like = (info.tot_like / info.tot_weight); + + ObjectiveValues aux_objfs(info.tot_aux_objfs); + aux_objfs.InvScale(info.tot_weight); + BaseFloat tot_objf = like + aux_objfs.Sum(); + + // Remove scales for the purpose of printing + if (info.objf_scale != 0.0) like /= info.objf_scale; + if (info.aux_objf_scales.size() > 0) + aux_objfs.InvScale(info.aux_objf_scales); + + if (info.tot_aux_objfs.IsZero()) { KALDI_LOG << "Overall log-probability for '" << name << "' is " << like << " per frame" @@ -198,7 +327,8 @@ bool NnetChainComputeProb::PrintTotalStats() const { } else { KALDI_LOG << "Overall log-probability for '" << name << "' is " - << like << " + " << l2_term << " = " << tot_objf << " per frame" + << like << " + " << aux_objfs.Str() + << " = " << tot_objf << " per frame" << ", over " << info.tot_weight << " frames."; } if (info.tot_weight > 0) @@ -224,7 +354,7 @@ double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { unordered_map::const_iterator iter = objf_info_.begin(), end = objf_info_.end(); for (; iter != end; ++iter) { - tot_objectives += iter->second.tot_like + iter->second.tot_l2_term; + tot_objectives += iter->second.tot_like + iter->second.tot_aux_objfs.Sum(); tot_weight += iter->second.tot_weight; } @@ -270,6 +400,32 @@ void RecomputeStats(const std::vector &egs, KALDI_LOG << "Done recomputing stats."; } +/* +void RecomputeStats(const std::vector &egs, + const chain::ChainTrainingOptions &chain_config_in, + const fst::StdVectorFst &den_fst, + Nnet *nnet) { + KALDI_LOG << "Recomputing stats on nnet (affects batch-norm)"; + chain::ChainTrainingOptions chain_config(chain_config_in); + if (HasXentOutputs(*nnet) && + chain_config.xent_regularize == 0) { + // this forces it to compute the output for xent outputs, + // usually 'output-xent', which + // means that we'll be computing batch-norm stats for any + // components in that branch that have batch-norm. 
+ chain_config.xent_regularize = 0.1; + } + + ZeroComponentStats(nnet); + NnetComputeProbOptions nnet_config; + nnet_config.store_component_stats = true; + NnetChainComputeProb prob_computer(nnet_config, chain_config, den_fst, nnet); + for (size_t i = 0; i < egs.size(); i++) + prob_computer.Compute(egs[i]); + prob_computer.PrintTotalStats(); + KALDI_LOG << "Done recomputing stats."; +} +*/ } // namespace nnet3 diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 49fc5c8f4d8..1102c22ea88 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -36,10 +36,18 @@ namespace nnet3 { struct ChainObjectiveInfo { double tot_weight; double tot_like; - double tot_l2_term; + BaseFloat objf_scale; + std::vector aux_objf_scales; + + ObjectiveValues tot_aux_objfs; ChainObjectiveInfo(): tot_weight(0.0), tot_like(0.0), - tot_l2_term(0.0) { } + objf_scale(1.0) { } + + ChainObjectiveInfo(BaseFloat objf_scale, + const std::vector &aux_objf_scales): + tot_weight(0.0), tot_like(0.0), + objf_scale(objf_scale), aux_objf_scales(aux_objf_scales) { } }; @@ -69,6 +77,7 @@ class NnetChainComputeProb { const fst::StdVectorFst &den_fst, Nnet *nnet); + void ParseObjectiveOpts(const chain::ChainTrainingOptions &chain_config); // Reset the likelihood stats, and the derivative stats (if computed). void Reset(); @@ -76,6 +85,9 @@ class NnetChainComputeProb { // compute objective on one minibatch. void Compute(const NnetChainExample &chain_eg); + // compute objective on one minibatch. + // void Compute(const NnetExample &eg); + // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; @@ -97,6 +109,9 @@ class NnetChainComputeProb { void ProcessOutputs(const NnetChainExample &chain_eg, NnetComputer *computer); + // void ProcessOutputs(const NnetExample &chain_eg, + // NnetComputer *computer); + NnetComputeProbOptions nnet_config_; chain::ChainTrainingOptions chain_config_; chain::DenominatorGraph den_graph_; @@ -108,6 +123,12 @@ class NnetChainComputeProb { unordered_map objf_info_; + CuArray sil_indices_; + + unordered_map smbr_factors_; + unordered_map mmi_factors_; + unordered_map ml_factors_; + unordered_map kl_factors_; }; /// This function zeros the stored component-level stats in the nnet using @@ -119,6 +140,10 @@ void RecomputeStats(const std::vector &egs, const fst::StdVectorFst &den_fst, Nnet *nnet); +//void RecomputeStats(const std::vector &egs, +// const chain::ChainTrainingOptions &chain_config, +// const fst::StdVectorFst &den_fst, +// Nnet *nnet); } // namespace nnet3 diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index c627bb1032a..9e7a885f4bb 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -87,6 +87,8 @@ void NnetChainSupervision::CheckDim() const { KALDI_ASSERT(deriv_weights.Dim() == indexes.size()); KALDI_ASSERT(deriv_weights.Min() >= 0.0); } + if (supervision.numerator_post_targets.NumRows() > 0) + KALDI_ASSERT(indexes.size() == supervision.numerator_post_targets.NumRows()); } NnetChainSupervision::NnetChainSupervision(const NnetChainSupervision &other): @@ -209,8 +211,13 @@ static void MergeSupervision( chain::Supervision output_supervision; MergeSupervision(input_supervision, &output_supervision); + if (output_supervision.numerator_post_targets.NumRows() > 0) + KALDI_ASSERT(output_supervision.frames_per_sequence * output_supervision.num_sequences == output_supervision.numerator_post_targets.NumRows()); 
output->supervision.Swap(&output_supervision); + if (output->supervision.numerator_post_targets.NumRows() > 0) + KALDI_ASSERT(output->supervision.frames_per_sequence * output->supervision.num_sequences == output->supervision.numerator_post_targets.NumRows()); + output->indexes.clear(); output->indexes.reserve(num_indexes); for (int32 n = 0; n < num_inputs; n++) { @@ -287,6 +294,28 @@ void MergeChainExamples(bool compress, } } +void TruncateDerivWeights(int32 truncate, + NnetChainExample *eg) { + for (size_t i = 0; i < eg->outputs.size(); i++) { + NnetChainSupervision &supervision = eg->outputs[i]; + Vector &deriv_weights = supervision.deriv_weights; + if (deriv_weights.Dim() == 0) { + deriv_weights.Resize(supervision.indexes.size()); + deriv_weights.Set(1.0); + } + int32 num_sequences = supervision.supervision.num_sequences, + frames_per_sequence = supervision.supervision.frames_per_sequence; + KALDI_ASSERT(2 * truncate < frames_per_sequence); + for (int32 t = 0; t < truncate; t++) + for (int32 s = 0; s < num_sequences; s++) + deriv_weights(t * num_sequences + s) = 0.0; + for (int32 t = frames_per_sequence - truncate; + t < frames_per_sequence; t++) + for (int32 s = 0; s < num_sequences; s++) + deriv_weights(t * num_sequences + s) = 0.0; + } +} + void GetChainComputationRequest(const Nnet &nnet, const NnetChainExample &eg, bool need_model_derivative, @@ -550,7 +579,5 @@ void ChainExampleMerger::Finish() { stats_.PrintStats(); } - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 187bb4ef3a3..a2068fba7ea 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -193,6 +193,15 @@ void ShiftChainExampleTimes(int32 frame_shift, const std::vector &exclude_names, NnetChainExample *eg); +/** + This sets to zero any elements of 'egs->outputs[*].deriv_weights' that correspond + to frames within the first or last 'truncate' frames of the sequence (e.g. you could + set 'truncate=5' to set zero deriv-weight for the first and last 5 frames of the + sequence). + */ +void TruncateDerivWeights(int32 truncate, + NnetChainExample *eg); + /** This function takes a NnetChainExample and produces a ComputationRequest. Assumes you don't want the derivatives w.r.t. the inputs; if you do, you can create the ComputationRequest manually. 
Assumes that if @@ -273,8 +282,6 @@ class ChainExampleMerger { MapType eg_to_egs_; }; - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 2ec2699ec97..53c926148a3 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -24,6 +24,7 @@ namespace kaldi { namespace nnet3 { + NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, const fst::StdVectorFst &den_fst, Nnet *nnet): @@ -56,8 +57,62 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, "Probably this is the first training iteration."; } } -} + if (opts.chain_config.use_smbr_objective && + (opts.chain_config.exclude_silence || opts.chain_config.one_silence_class)) { + if (opts.chain_config.silence_pdfs_str.empty()) { + KALDI_ERR << "--silence-pdfs is required if --exclude-silence or " + << "--one-silence-class is true."; + } + + std::vector silence_pdfs; + SplitStringToVector(opts.chain_config.silence_pdfs_str, ":,", false, + &silence_pdfs); + + int32 num_pdfs = nnet->OutputDim("output"); + std::vector indices(num_pdfs, -1); + + if (opts.chain_config.exclude_silence) { + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } + + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << opts.chain_config.silence_pdfs_str; + indices[pdf] = -1; + } + } else { + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << opts.chain_config.silence_pdfs_str; + indices[pdf] = pdf; + } + } + + sil_indices_.Resize(num_pdfs); + sil_indices_.CopyFromVec(indices); + } + + if (!opts.chain_config.smbr_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.smbr_factors_str, + &smbr_factors_); + if (!opts.chain_config.mmi_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.mmi_factors_str, + &mmi_factors_); + if (!opts.chain_config.ml_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.ml_factors_str, + &ml_factors_); + if (!opts.chain_config.kl_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.kl_factors_str, + &kl_factors_); +} void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { bool need_model_derivative = true; @@ -99,18 +154,21 @@ class ChainTrainerMemoryHolder { public: ChainTrainerMemoryHolder(const Nnet &nnet, int32 num_den_graph_states, - const NnetChainExample &eg); + const NnetChainExample &eg, + bool use_smbr_objective = false); private: CuMatrix nnet_output_deriv_; CuMatrix xent_output_deriv_; CuMatrix beta_; CuMatrix alpha_; - + CuMatrix beta_smbr_; + CuMatrix alpha_smbr_; }; ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, int32 den_graph_states, - const NnetChainExample &eg) { + const NnetChainExample &eg, + bool use_smbr_objective) { std::vector::const_iterator iter = eg.outputs.begin(), end = eg.outputs.end(); @@ -147,6 +205,72 @@ ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, } } + // the sequence of resizes is in a specific order (bigger to smaller) + // so that the cudaMalloc won't trash the memory it has already + // alloc'd in the previous iterations + alpha_.Resize(max_frames_per_sequence, + max_sequence_size, + kUndefined); + + nnet_output_deriv_.Resize(max_rows, max_cols, 
kUndefined); + // note: the same block of memory can be used for xent_output_deriv_ as is + // used for exp_nnet_output_transposed_ in chain-training.cc. + xent_output_deriv_.Resize(max_rows, max_cols, + kUndefined, kStrideEqualNumCols); + + beta_.Resize(2, max_sequence_size, kUndefined); + + if (use_smbr_objective) { + alpha_smbr_.Resize(max_frames_per_sequence, + max_sequence_size, + kUndefined); + beta_smbr_.Resize(2, max_sequence_size, kUndefined); + } +} + +/* +ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, + int32 den_graph_states, + const NnetExample &eg) { + + std::vector::const_iterator iter = eg.io.begin(), + end = eg.io.end(); + + int32 max_rows = 0, + max_cols = 0; + + size_t max_frames_per_sequence = 0, + max_sequence_size = 0, + max_alpha_matrix_size = 0; + + for (; iter != end; ++iter) { + const NnetIo &io = *iter; + int32 node_index = nnet.GetNodeIndex(io.name); + KALDI_ASSERT(node_index >= 0); + if (!nnet.IsOutputNode(node_index)) continue; + + int32 output_rows = io.features.NumRows(); + int32 output_cols = nnet.OutputDim("output"); + + int32 num_sequences = NumSequencesInChainEg(io.indexes); + size_t curr_frames_per_sequence = output_rows / num_sequences + 1; + size_t den_graph_size = den_graph_states + 1; + size_t curr_sequence_size = den_graph_size * num_sequences; + size_t curr_alpha_matrix_size = curr_frames_per_sequence * curr_sequence_size; + + if (curr_alpha_matrix_size > max_alpha_matrix_size) { + max_alpha_matrix_size = curr_alpha_matrix_size; + max_frames_per_sequence = curr_frames_per_sequence; + max_sequence_size = curr_sequence_size; + } + + size_t matrix_size = output_rows * output_cols; + if (matrix_size > (max_rows * max_cols)) { + max_rows = output_rows; + max_cols = output_cols; + } + } + // the sequence of resizes is in a specific order (bigger to smaller) // so that the cudaMalloc won't trash the memory it has already // alloc'd in the previous iterations @@ -164,7 +288,7 @@ ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, beta_.Resize(2, max_sequence_size, kUndefined); } -void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, +void NnetChainTrainer::TrainInternal(const NnetExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; // note: because we give the 1st arg (nnet_) as a pointer to the @@ -178,6 +302,60 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, ChainTrainerMemoryHolder *memory_holder = new ChainTrainerMemoryHolder(*nnet_, den_graph_.NumStates(), eg); + // give the inputs to the computer object + computer.AcceptInputs(*nnet_, eg.io); + computer.Run(); + + // 'this->ProcessOutputs()' is going to need the same sizes as are stored in + // 'memory_holder'. + delete memory_holder; + + this->ProcessOutputs(false, eg, &computer); + computer.Run(); + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. + ApplyL2Regularization(*nnet_, + GetNumNvalues(eg.io, false) * + nnet_config.l2_regularize_factor, + delta_nnet_); + + // Updates the parameters of nnet + bool success = UpdateNnetWithMaxChange(*delta_nnet_, + nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, + &num_max_change_per_component_applied_, &num_max_change_global_applied_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). 
+ ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. + ConstrainOrthonormal(nnet_); + + // Scale delta_nnet + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); +} +*/ + +void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, + const NnetComputation &computation) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. + NnetComputer computer(nnet_config.compute_config, computation, + nnet_, delta_nnet_); + + // reserve the memory needed in ProcessOutputs (before memory gets fragmented + // by the call to computer.Run(). + ChainTrainerMemoryHolder *memory_holder = + new ChainTrainerMemoryHolder(*nnet_, den_graph_.NumStates(), eg, + opts_.chain_config.use_smbr_objective); + // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.inputs); computer.Run(); @@ -299,17 +477,74 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, nnet_output.NumCols(), kUndefined); - bool use_xent = (opts_.chain_config.xent_regularize != 0.0); + chain::ChainTrainingOptions chain_config(opts_.chain_config); + + { + auto it = smbr_factors_.find(sup.name); + if (it != smbr_factors_.end()) + chain_config.smbr_factor = it->second; + + if (chain_config.smbr_factor > 0.0 && !chain_config.use_smbr_objective) + KALDI_ERR << "smbr factor for " << sup.name << " = " + << chain_config.smbr_factor + << " > 0.0, but --use-smbr-objective=false"; + } + { + auto it = mmi_factors_.find(sup.name); + if (it != mmi_factors_.end()) + chain_config.mmi_factor = it->second; + } + { + auto it = ml_factors_.find(sup.name); + if (it != ml_factors_.end()) + chain_config.ml_factor = it->second; + } + { + auto it = kl_factors_.find(sup.name); + if (it != kl_factors_.end()) + chain_config.kl_factor = it->second; + } + + bool use_xent = (chain_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix xent_deriv; - BaseFloat tot_objf, tot_l2_term, tot_weight; + BaseFloat tot_objf, tot_mmi_objf, tot_l2_term, tot_weight; - ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); + if (chain_config.kl_factor > 0.0) { + KALDI_ASSERT(chain_config.smbr_factor == 0.0); + if (!chain_config.self_kl) + KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0); + } + + if (chain_config.smbr_factor > 0.0) { + ComputeChainSmbrObjfAndDeriv(chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_mmi_objf, + &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? &sil_indices_ : NULL); + } else { + ComputeChainObjfAndDeriv(chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? 
&xent_deriv : NULL)); + + if (chain_config.self_kl) { + const CuMatrixBase &teacher_nnet_output = + computer->GetOutput(sup.name + "-teacher"); + + BaseFloat num_objf = 0, num_weight = 0.0; + ComputeChainDenominatorObjfAndDeriv( + chain_config, den_graph_, teacher_nnet_output, + sup.supervision.weight, sup.supervision.num_sequences, + &num_objf, &num_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); + } + } if (use_xent) { // this block computes the cross-entropy objective. @@ -318,6 +553,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, // at this point, xent_deriv is posteriors derived from the numerator // computation. note, xent_objf has a factor of '.supervision.weight' BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, opts_.nnet_config.print_interval, num_minibatches_processed_, @@ -331,15 +567,51 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, xent_deriv.MulRowsVec(cu_deriv_weights); } - computer->AcceptInput(sup.name, &nnet_output_deriv); + std::vector objective_values; + objective_values.push_back(tot_l2_term); + if (chain_config.smbr_factor > 0.0) + objective_values.push_back(tot_mmi_objf); + + { + unordered_map::iterator it + = objf_info_.find(sup.name + suffix); + + if (it == objf_info_.end()) { + BaseFloat this_objf_scale = 1.0; + std::vector aux_objf_scales(1, 1.0); // l2_term + if (chain_config.smbr_factor > 0.0) { + this_objf_scale *= chain_config.smbr_factor; + aux_objf_scales.push_back( + (chain_config.mmi_factor + chain_config.ml_factor)); + } + + ObjectiveFunctionInfo totals(this_objf_scale, aux_objf_scales); + it = objf_info_.insert(it, std::make_pair(sup.name + suffix, totals)); + } + + if (opts_.accumulate_avg_deriv && + it->second.deriv_sum.Dim() == 0) + it->second.deriv_sum.Resize(nnet_output.NumCols()); + + if (it->second.deriv_sum.Dim() > 0) + it->second.deriv_sum.AddRowSumMat(1.0, nnet_output_deriv, 1.0); + + it->second.UpdateStats(sup.name + suffix, + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, objective_values); + } - objf_info_[sup.name + suffix].UpdateStats(sup.name + suffix, - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, tot_objf, tot_l2_term); + computer->AcceptInput(sup.name, &nnet_output_deriv); if (use_xent) { - xent_deriv.Scale(opts_.chain_config.xent_regularize); + xent_deriv.Scale(chain_config.xent_regularize); + if (opts_.accumulate_avg_deriv && + objf_info_[xent_name + suffix].deriv_sum.Dim() == 0) + objf_info_[xent_name + suffix].deriv_sum.Resize(nnet_output.NumCols()); + if (objf_info_[xent_name + suffix].deriv_sum.Dim() > 0) + objf_info_[xent_name + suffix].deriv_sum.AddRowSumMat( + 1.0, xent_deriv, 1.0); computer->AcceptInput(xent_name, &xent_deriv); } } @@ -353,6 +625,7 @@ bool NnetChainTrainer::PrintTotalStats() const { for (; iter != end; ++iter) { const std::string &name = iter->first; const ObjectiveFunctionInfo &info = iter->second; + ans = info.PrintTotalStats(name) || ans; } PrintMaxChangeStats(); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 5bf6a3f6fce..63febb6d719 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -36,7 +36,8 @@ struct NnetChainTrainingOptions { NnetTrainerOptions nnet_config; chain::ChainTrainingOptions chain_config; bool apply_deriv_weights; - NnetChainTrainingOptions(): apply_deriv_weights(true) { } + bool accumulate_avg_deriv; + 
NnetChainTrainingOptions(): apply_deriv_weights(true), accumulate_avg_deriv(true) { } void Register(OptionsItf *opts) { nnet_config.Register(opts); @@ -44,6 +45,9 @@ struct NnetChainTrainingOptions { opts->Register("apply-deriv-weights", &apply_deriv_weights, "If true, apply the per-frame derivative weights stored with " "the example"); + opts->Register("accumulate-avg-deriv", &accumulate_avg_deriv, + "If true, the average derivative will be accumulated and " + "printed"); } }; @@ -61,6 +65,9 @@ class NnetChainTrainer { // train on one minibatch. void Train(const NnetChainExample &eg); + // train on one minibatch using NnetExample + // void Train(const NnetExample &eg); + // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; @@ -74,6 +81,10 @@ class NnetChainTrainer { void TrainInternal(const NnetChainExample &eg, const NnetComputation &computation); + // The internal function for doing one step of conventional SGD training. + // void TrainInternal(const NnetExample &eg, + // const NnetComputation &computation); + // The internal function for doing one step of backstitch training. Depending // on whether is_backstitch_step1 is true, It could be either the first // (backward) step, or the second (forward) step of backstitch. @@ -84,6 +95,9 @@ class NnetChainTrainer { void ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, NnetComputer *computer); + // void ProcessOutputs(bool is_backstitch_step2, const NnetExample &eg, + // NnetComputer *computer); + const NnetChainTrainingOptions opts_; chain::DenominatorGraph den_graph_; @@ -110,6 +124,13 @@ class NnetChainTrainer { // consistent dropout masks. It's set to a value derived from rand() // when the class is initialized. int32 srand_seed_; + + CuArray sil_indices_; + + unordered_map smbr_factors_; + unordered_map mmi_factors_; + unordered_map ml_factors_; + unordered_map kl_factors_; }; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 2c76805f5cc..e4f89d7f42b 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -416,12 +416,14 @@ std::string NonlinearComponent::Info() const { Vector value_avg(value_avg_dbl); value_avg.Scale(1.0 / count_); stream << ", value-avg=" << SummarizeVector(value_avg); + if (deriv_sum_.Dim() == dim_) { Vector deriv_avg(deriv_sum_); deriv_avg.Scale(1.0 / count_); stream << ", deriv-avg=" << SummarizeVector(deriv_avg); } } + if (oderiv_count_ > 0 && oderiv_sumsq_.Dim() == dim_) { Vector oderiv_rms(oderiv_sumsq_); oderiv_rms.Scale(1.0 / oderiv_count_); diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index ca6124a212f..f9eabbc9557 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -125,6 +125,7 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg, if (config_.compute_accuracy) { BaseFloat tot_weight, tot_accuracy; PerDimObjectiveInfo &acc_totals = accuracy_info_[io.name]; + Vector tot_weight_vec, tot_objective_vec; if (config_.compute_per_dim_accuracy && acc_totals.tot_objective_vec.Dim() == 0) { @@ -132,14 +133,24 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg, acc_totals.tot_weight_vec.Resize(output.NumCols()); } + if (config_.compute_per_dim_accuracy) { + tot_objective_vec.Resize(output.NumCols()); + tot_weight_vec.Resize(output.NumCols()); + } + ComputeAccuracy(io.features, output, &tot_weight, &tot_accuracy, - config_.compute_per_dim_accuracy ? 
- &acc_totals.tot_weight_vec : NULL, - config_.compute_per_dim_accuracy ? - &acc_totals.tot_objective_vec : NULL); + config_.compute_per_dim_accuracy ? + &tot_weight_vec : NULL, + config_.compute_per_dim_accuracy ? + &tot_objective_vec : NULL); acc_totals.tot_weight += tot_weight; acc_totals.tot_objective += tot_accuracy; + + if (config_.compute_per_dim_accuracy) { + acc_totals.tot_objective_vec.AddVec(1.0, tot_objective_vec); + acc_totals.tot_weight_vec.AddVec(1.0, tot_weight_vec); + } } } } diff --git a/src/nnet3/nnet-diagnostics.h b/src/nnet3/nnet-diagnostics.h index 61e2ed18e1d..f0c905a3c3e 100644 --- a/src/nnet3/nnet-diagnostics.h +++ b/src/nnet3/nnet-diagnostics.h @@ -51,7 +51,6 @@ struct PerDimObjectiveInfo: public SimpleObjectiveInfo { Vector tot_objective_vec; }; - struct NnetComputeProbOptions { bool debug_computation; bool compute_deriv; @@ -63,6 +62,7 @@ struct NnetComputeProbOptions { bool store_component_stats; bool compute_per_dim_accuracy; + std::string objective_scales_str; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; @@ -84,6 +84,10 @@ struct NnetComputeProbOptions { "accuracy values as well as objective functions"); opts->Register("compute-per-dim-accuracy", &compute_per_dim_accuracy, "If true, compute accuracy values per-dim"); + opts->Register("objective-scales", &objective_scales_str, + "Objective scales for the outputs specified as " + "a comma-separated list of pairs " + ":,:..."); // register the optimization options with the prefix "optimization". ParseOptions optimization_opts("optimization", opts); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index cc5fe3cc050..1f5bb5397db 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -89,7 +89,8 @@ static void MergeIo(const std::vector &src, const std::vector &names, const std::vector &sizes, bool compress, - NnetExample *merged_eg) { + NnetExample *merged_eg, + bool sort_by_t) { // The total number of Indexes we have across all examples. int32 num_feats = names.size(); @@ -98,6 +99,8 @@ static void MergeIo(const std::vector &src, // The features in the different NnetIo in the Indexes across all examples std::vector > output_lists(num_feats); + std::vector const*> > deriv_weights_lists(num_feats); + // Initialize the merged_eg merged_eg->io.clear(); merged_eg->io.resize(num_feats); @@ -130,6 +133,14 @@ static void MergeIo(const std::vector &src, // Add f'th Io's features output_lists[f].push_back(&(io.features)); + if (io.deriv_weights.Dim() != 0 && + merged_eg->io[f].deriv_weights.Dim() == 0) { + merged_eg->io[f].deriv_weights.Resize(sizes[f], kUndefined); + } + + if (merged_eg->io[f].deriv_weights.Dim() != 0) + deriv_weights_lists[f].push_back(&(io.deriv_weights)); + // Work on the Indexes for the f^th Io in merged_eg NnetIo &output_io = merged_eg->io[f]; std::copy(io.indexes.begin(), io.indexes.end(), @@ -143,16 +154,56 @@ static void MergeIo(const std::vector &src, "Merging already-merged egs? Not currentlysupported."); output_iter[i].n = n; } + this_offset += this_size; // note: this_offset is a reference. } } + KALDI_ASSERT(cur_size == sizes); for (int32 f = 0; f < num_feats; f++) { + NnetIo &output_io = merged_eg->io[f]; + AppendGeneralMatrixRows(output_lists[f], - &(merged_eg->io[f].features)); + &(output_io.features), + output_io.name == "output" ? sort_by_t : false); + if (compress) { // the following won't do anything if the features were sparse. 
- merged_eg->io[f].features.Compress(); + output_io.features.Compress(); + } + + if (output_io.name != "output") continue; + + if (sort_by_t) + std::sort(output_io.indexes.begin(), output_io.indexes.end()); + + if (output_io.deriv_weights.Dim() != 0) { + // merge the deriv_weights. + int32 num_inputs = deriv_weights_lists[f].size(); + KALDI_ASSERT(num_inputs > 0 + && deriv_weights_lists[f][0]->Dim() != 0); + int32 frames_per_sequence = deriv_weights_lists[f][0]->Dim(); + + if (output_io.deriv_weights.Dim() != frames_per_sequence * num_inputs) + KALDI_ERR << output_io.deriv_weights.Dim() + << " != " << frames_per_sequence << " * " << num_inputs; + + for (int32 n = 0; n < num_inputs; n++) { + const Vector &src_deriv_weights = *(deriv_weights_lists[f][n]); + KALDI_ASSERT(src_deriv_weights.Dim() == frames_per_sequence); + + if (sort_by_t) { + // the ordering of the deriv_weights corresponds to the ordering of the + // Indexes, where the time dimension has the greater stride. + for (int32 t = 0; t < frames_per_sequence; t++) { + output_io.deriv_weights(t * num_inputs + n) = src_deriv_weights(t); + } + } else { + for (int32 t = 0; t < frames_per_sequence; t++) { + output_io.deriv_weights(t + n * num_inputs) = src_deriv_weights(t); + } + } + } } } } @@ -161,14 +212,15 @@ static void MergeIo(const std::vector &src, void MergeExamples(const std::vector &src, bool compress, - NnetExample *merged_eg) { + NnetExample *merged_eg, + bool sort_by_t) { KALDI_ASSERT(!src.empty()); std::vector io_names; GetIoNames(src, &io_names); // the sizes are the total number of Indexes we have across all examples. std::vector io_sizes; GetIoSizes(src, io_names, &io_sizes); - MergeIo(src, io_names, io_sizes, compress, merged_eg); + MergeIo(src, io_names, io_sizes, compress, merged_eg, sort_by_t); } void ShiftExampleTimes(int32 t_offset, @@ -199,15 +251,18 @@ void ShiftExampleTimes(int32 t_offset, } } + void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, bool store_component_stats, - ComputationRequest *request) { + ComputationRequest *request, + bool use_xent_regularization, + bool use_xent_derivative) { request->inputs.clear(); request->inputs.reserve(eg.io.size()); request->outputs.clear(); - request->outputs.reserve(eg.io.size()); + request->outputs.reserve((use_xent_regularization ? 2 : 1) * eg.io.size()); request->need_model_derivative = need_model_derivative; request->store_component_stats = store_component_stats; for (size_t i = 0; i < eg.io.size(); i++) { @@ -226,6 +281,18 @@ void GetComputationRequest(const Nnet &nnet, io_spec.name = name; io_spec.indexes = io.indexes; io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative; + if (use_xent_regularization && nnet.IsOutputNode(node_index)) { + size_t cur_size = request->outputs.size(); + request->outputs.resize(cur_size + 1); + IoSpecification &io_spec = request->outputs[cur_size - 1], + &io_spec_xent = request->outputs[cur_size]; + // the IoSpecification for the -xent output is the same + // as for the regular output, except for its name which has + // the -xent suffix (and the has_deriv member may differ). + io_spec_xent = io_spec; + io_spec_xent.name = name + "-xent"; + io_spec_xent.has_deriv = use_xent_derivative; + } } // check to see if something went wrong. 
if (request->inputs.empty()) @@ -822,6 +889,43 @@ void UtteranceSplitter::GetGapSizes(int32 utterance_length, void UtteranceSplitter::GetChunksForUtterance( int32 utterance_length, std::vector *chunk_info) { + + if (config_.no_chunking) { + int32 min_diff = 100; + int32 len_extend_context = 0; + + for (std::vector::const_iterator it = config_.num_frames.begin(); + it != config_.num_frames.end(); ++it) { + if (abs(utterance_length - *it) < abs(min_diff)) + min_diff = utterance_length - *it; + } + + if (min_diff != 0) { + KALDI_WARN << "No exact match found for the length " << utterance_length + << " closest allowed length is off by " << min_diff + << " frames. Will try to fix it.."; + + if (abs(min_diff) < 5) // we assume possibly up to 5 frames from the end can be safely deleted + len_extend_context = -min_diff; // let the code below do it + else // unexpected + KALDI_ERR << "Too much length difference " << min_diff; + } + + chunk_info->resize(1); + ChunkTimeInfo &info = (*chunk_info)[0]; + + info.first_frame = 0; + info.num_frames = utterance_length + len_extend_context; + info.left_context = (config_.left_context_initial >= 0 ? + config_.left_context_initial : config_.left_context); + info.right_context = (config_.right_context_final >= 0 ? + config_.right_context_final : config_.right_context); + + SetOutputWeights(utterance_length, chunk_info); + AccStatsForUtterance(utterance_length, *chunk_info); + return; + } + int32 t = 0; if (config_.num_frames_str == "-1" ) { ChunkTimeInfo *info; @@ -1230,7 +1334,7 @@ void ExampleMerger::WriteMinibatch(const std::vector &egs) { int32 minibatch_size = egs.size(); stats_.WroteExample(eg_size, structure_hash, minibatch_size); NnetExample merged_eg; - MergeExamples(egs, config_.compress, &merged_eg); + MergeExamples(egs, config_.compress, &merged_eg, config_.sort_by_t); std::ostringstream key; key << "merged-" << (num_egs_written_++) << "-" << minibatch_size; writer_->Write(key.str(), merged_eg); diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 52b2ebbf904..ba3a99b4245 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -33,10 +33,14 @@ namespace nnet3 { /** Merge a set of input examples into a single example (typically the size of "src" will be the minibatch size). Will crash if "src" is the empty vector. If "compress" is true, it will compress any non-sparse features in the output. + + If sort_by_t is true, the examples and indexes for output are sorted first + by 't' and then by 'n' index. */ void MergeExamples(const std::vector &src, bool compress, - NnetExample *dest); + NnetExample *dest, + bool sort_by_t = false); /** Shifts the time-index t of everything in the "eg" by adding "t_offset" to @@ -56,13 +60,22 @@ void ShiftExampleTimes(int32 t_offset, inputs; if you do, you can create/modify the ComputationRequest manually. Assumes that if need_model_derivative is true, you will be supplying derivatives w.r.t. all outputs. + + If use_xent_regularization == true, then it assumes that for each output + name (e.g. "output" in the eg, there is another output with the same + dimension and with the suffix "-xent" on its name, e.g. named + "output-xent". The derivative w.r.t. the xent objective will only be + supplied to the nnet computation if 'use_xent_derivative' is true (we + propagate back the xent derivative to the model only in training, not in + model-combination in nnet3-chain-combine). 
*/ void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, bool store_component_stats, - ComputationRequest *computation_request); - + ComputationRequest *computation_request, + bool use_xent_regularization = false, + bool use_xent_derivative = false); // Writes as unsigned char a vector 'vec' that is required to have // values between 0 and 1. @@ -87,6 +100,7 @@ struct ExampleGenerationConfig { int32 num_frames_overlap; int32 frame_subsampling_factor; std::string num_frames_str; + bool no_chunking; // The following parameters are derived parameters, computed by @@ -101,7 +115,7 @@ struct ExampleGenerationConfig { left_context(0), right_context(0), left_context_initial(-1), right_context_final(-1), num_frames_overlap(0), frame_subsampling_factor(1), - num_frames_str("1") { } + num_frames_str("1"), no_chunking(false) { } /// This function decodes 'num_frames_str' into 'num_frames', and ensures that /// the members of 'num_frames' are multiples of 'frame_subsampling_factor'. @@ -141,6 +155,9 @@ struct ExampleGenerationConfig { po->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " "if the frame-rate of the output labels in the generated " "examples will be less than the frame-rate at the input"); + po->Register("no-chunking", &no_chunking, "If set to true, then the " + "whole utterance will be used and there will be no " + "chunking"); } }; @@ -325,12 +342,14 @@ class ExampleMergingConfig { std::string measure_output_frames; // for back-compatibility, not used. std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. - + bool sort_by_t; // If true, the examples and indexes are sorted + // first by 't' and next by 'n'. ExampleMergingConfig(const char *default_minibatch_size = "256"): compress(false), measure_output_frames("deprecated"), minibatch_size(default_minibatch_size), - discard_partial_minibatches("deprecated") { } + discard_partial_minibatches("deprecated"), + sort_by_t(false) { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -354,6 +373,9 @@ class ExampleMergingConfig { "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); + po->Register("sort-by-t", &sort_by_t, + "If true, the features in examples and indexes are sorted " + "first by 't' and next by 'n'."); } @@ -508,7 +530,6 @@ class ExampleMerger { const ExampleMergingConfig &config_; NnetExampleWriter *writer_; ExampleMergingStats stats_; - // Note: the "key" into the egs is the first element of the vector. 
typedef unordered_map, NnetExampleStructureHasher, diff --git a/src/nnet3/nnet-example.cc b/src/nnet3/nnet-example.cc index b79a547ffcf..048797be21a 100644 --- a/src/nnet3/nnet-example.cc +++ b/src/nnet3/nnet-example.cc @@ -31,6 +31,10 @@ void NnetIo::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, name); WriteIndexVector(os, binary, indexes); features.Write(os, binary); + if (deriv_weights.Dim() > 0) { + WriteToken(os, binary, ""); + deriv_weights.Write(os, binary); + } WriteToken(os, binary, ""); KALDI_ASSERT(static_cast(features.NumRows()) == indexes.size()); } @@ -40,7 +44,15 @@ void NnetIo::Read(std::istream &is, bool binary) { ReadToken(is, binary, &name); ReadIndexVector(is, binary, &indexes); features.Read(is, binary); - ExpectToken(is, binary, ""); + + std::string token; + ReadToken(is, binary, &token); + if (token != "") { + KALDI_ASSERT(token == ""); + deriv_weights.Read(is, binary); + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == ""); } bool NnetIo::operator == (const NnetIo &other) const { @@ -49,6 +61,8 @@ bool NnetIo::operator == (const NnetIo &other) const { if (features.NumRows() != other.features.NumRows() || features.NumCols() != other.features.NumCols()) return false; + if (deriv_weights.Dim() > 0 && + !deriv_weights.ApproxEqual(other.deriv_weights)) return false; Matrix this_mat, other_mat; features.GetMatrix(&this_mat); other.features.GetMatrix(&other_mat); @@ -81,6 +95,7 @@ void NnetIo::Swap(NnetIo *other) { name.swap(other->name); indexes.swap(other->indexes); features.Swap(&(other->features)); + deriv_weights.Swap(&(other->deriv_weights)); } NnetIo::NnetIo(const std::string &name, @@ -98,6 +113,22 @@ NnetIo::NnetIo(const std::string &name, indexes[i].t = t_begin + i * t_stride; } +NnetIo::NnetIo(const std::string &name, + int32 dim, + int32 t_begin, + const Posterior &labels, + const VectorBase &deriv_weights, + int32 t_stride): + name(name), deriv_weights(deriv_weights) { + int32 num_rows = labels.size(); + KALDI_ASSERT(num_rows > 0); + SparseMatrix sparse_feats(dim, labels); + features = sparse_feats; + indexes.resize(num_rows); // sets all n,t,x to zeros. + for (int32 i = 0; i < num_rows; i++) + indexes[i].t = t_begin + i * t_stride; + KALDI_ASSERT(num_rows == deriv_weights.Dim()); +} void NnetExample::Write(std::ostream &os, bool binary) const { diff --git a/src/nnet3/nnet-example.h b/src/nnet3/nnet-example.h index d7312d49729..703ab555bfa 100644 --- a/src/nnet3/nnet-example.h +++ b/src/nnet3/nnet-example.h @@ -45,6 +45,8 @@ struct NnetIo { /// a Matrix, or SparseMatrix (a SparseMatrix would be the natural format for posteriors). 
GeneralMatrix features; + Vector deriv_weights; + /// This constructor creates NnetIo with name "name", indexes with n=0, x=0, /// and t values ranging from t_begin to /// (t_begin + t_stride * feats.NumRows() - 1) with a stride t_stride, and @@ -73,6 +75,13 @@ struct NnetIo { const Posterior &labels, int32 t_stride = 1); + NnetIo(const std::string &name, + int32 dim, + int32 t_begin, + const Posterior &labels, + const VectorBase &deriv_weights, + int32 t_stride = 1); + void Swap(NnetIo *other); NnetIo() { } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 69f8442a08a..e05cb101074 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3590,6 +3590,15 @@ void LogSoftmaxComponent::Backprop(const std::string &debug_info, in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv); } +void LogSoftmaxComponent::StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo) { + // We don't store derivative stats for this component type, just activation + // stats. + CuMatrix out_exp(out_value); + out_exp.ApplyExp(); + StoreStatsInternal(out_exp, NULL); +} void FixedScaleComponent::Init(const CuVectorBase &scales) { KALDI_ASSERT(scales.Dim() != 0); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 12ae99d716b..f19ccbb5c62 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -733,6 +733,9 @@ class LogSoftmaxComponent: public NonlinearComponent { void *memo, Component *to_update, CuMatrixBase *in_deriv) const; + virtual void StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo); virtual Component* Copy() const { return new LogSoftmaxComponent(*this); } private: diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 8fda24cd22d..7b29d22aa0c 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -266,13 +266,90 @@ void NnetTrainer::PrintMaxChangeStats() const { << " \% of the time."; } +ObjectiveValues::ObjectiveValues(const std::vector &values) { + for (std::vector::const_iterator it = values.begin(); + it != values.end(); ++it) { + objective_values.push_back(*it); + } +} + +void ObjectiveValues::Resize(int32 size) { + objective_values.clear(); + objective_values.resize(size); +} + +void ObjectiveValues::Add(const ObjectiveValues &other) { + if (Size() == 0) { + Resize(other.Size()); + } + + if (Size() != other.Size()) { + KALDI_ERR << "objective values must have same size."; + } + + for (size_t i = 0; i < Size(); i++) { + objective_values[i] += other.objective_values[i]; + } +} + +void ObjectiveValues::Scale(BaseFloat scale) { + for (std::vector::iterator it = objective_values.begin(); + it != objective_values.end(); ++it) + *it *= scale; +} + +void ObjectiveValues::InvScale(BaseFloat inv_scale) { + for (std::vector::iterator it = objective_values.begin(); + it != objective_values.end(); ++it) { + if (inv_scale != 0.0) + *it /= inv_scale; + else + KALDI_ASSERT(*it == 0.0); + } +} + +void ObjectiveValues::InvScale(const std::vector &inv_scales) { + KALDI_ASSERT(objective_values.size() == inv_scales.size()); + for (size_t i = 0; i < objective_values.size(); i++) { + if (inv_scales[i] != 0.0) + objective_values[i] /= inv_scales[i]; + else + KALDI_ASSERT(objective_values[i] == 0.0); + } +} + +bool ObjectiveValues::IsZero() const { + for (std::vector::const_iterator it = objective_values.begin(); + it != objective_values.end(); ++it) { + if (*it != 0.0) 
return false; + } + return true; +} + +double ObjectiveValues::Sum() const { + double sum = 0.0; + for (std::vector::const_iterator it = objective_values.begin(); + it != objective_values.end(); ++it) { + sum += *it; + } + return sum; +} + +std::string ObjectiveValues::Str() const { + std::ostringstream oss; + for (size_t i = 0; i < Size(); i++) { + oss << objective_values[i] << (i < Size() - 1 ? " + " : ""); + } + return oss.str(); +} + void ObjectiveFunctionInfo::UpdateStats( const std::string &output_name, int32 minibatches_per_phase, int32 minibatch_counter, BaseFloat this_minibatch_weight, BaseFloat this_minibatch_tot_objf, - BaseFloat this_minibatch_tot_aux_objf) { + const ObjectiveValues &this_minibatch_tot_aux_objfs) { int32 phase = minibatch_counter / minibatches_per_phase; if (phase != current_phase) { KALDI_ASSERT(phase > current_phase); @@ -281,16 +358,16 @@ void ObjectiveFunctionInfo::UpdateStats( current_phase = phase; tot_weight_this_phase = 0.0; tot_objf_this_phase = 0.0; - tot_aux_objf_this_phase = 0.0; + tot_aux_objfs_this_phase.Reset(); minibatches_this_phase = 0; } minibatches_this_phase++; tot_weight_this_phase += this_minibatch_weight; tot_objf_this_phase += this_minibatch_tot_objf; - tot_aux_objf_this_phase += this_minibatch_tot_aux_objf; + tot_aux_objfs_this_phase.Add(this_minibatch_tot_aux_objfs); tot_weight += this_minibatch_weight; tot_objf += this_minibatch_tot_objf; - tot_aux_objf += this_minibatch_tot_aux_objf; + tot_aux_objfs.Add(this_minibatch_tot_aux_objfs); } void ObjectiveFunctionInfo::PrintStatsForThisPhase( @@ -300,7 +377,7 @@ void ObjectiveFunctionInfo::PrintStatsForThisPhase( int32 start_minibatch = current_phase * minibatches_per_phase, end_minibatch = phase * minibatches_per_phase - 1; - if (tot_aux_objf_this_phase == 0.0) { + if (tot_aux_objfs_this_phase.IsZero()) { if (minibatches_per_phase == minibatches_this_phase) { KALDI_LOG << "Average objective function for '" << output_name << "' for minibatches " << start_minibatch @@ -316,41 +393,56 @@ void ObjectiveFunctionInfo::PrintStatsForThisPhase( << tot_weight_this_phase << " frames."; } } else { - BaseFloat objf = (tot_objf_this_phase / tot_weight_this_phase), - aux_objf = (tot_aux_objf_this_phase / tot_weight_this_phase), - sum_objf = objf + aux_objf; + BaseFloat objf = (tot_objf_this_phase / tot_weight_this_phase); + ObjectiveValues aux_objfs(tot_aux_objfs_this_phase); + aux_objfs.Scale(1.0 / tot_weight_this_phase); + BaseFloat sum_objf = objf + aux_objfs.Sum(); if (minibatches_per_phase == minibatches_this_phase) { KALDI_LOG << "Average objective function for '" << output_name << "' for minibatches " << start_minibatch << '-' << end_minibatch << " is " - << objf << " + " << aux_objf << " = " << sum_objf + << objf << " + " << aux_objfs.Str() << " = " << sum_objf << " over " << tot_weight_this_phase << " frames."; } else { KALDI_LOG << "Average objective function for '" << output_name << "' using " << minibatches_this_phase << " minibatches in minibatch range " << start_minibatch << '-' << end_minibatch << " is " - << objf << " + " << aux_objf << " = " << sum_objf + << objf << " + " << aux_objfs.Str() << " = " << sum_objf << " over " << tot_weight_this_phase << " frames."; } } } bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const { - BaseFloat objf = (tot_objf / tot_weight), - aux_objf = (tot_aux_objf / tot_weight), - sum_objf = objf + aux_objf; - if (tot_aux_objf == 0.0) { + BaseFloat objf = (tot_objf / tot_weight); + ObjectiveValues aux_objfs(tot_aux_objfs); + 
diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h
index fffc621930a..ac0705f5eb6 100644
--- a/src/nnet3/nnet-training.h
+++ b/src/nnet3/nnet-training.h
@@ -44,6 +44,7 @@ struct NnetTrainerOptions {
   std::string write_cache;
   bool binary_write_cache;
   BaseFloat max_param_change;
+  std::string objective_scales_str;
   NnetOptimizeOptions optimize_config;
   NnetComputeOptions compute_config;
   CachingOptimizingCompilerOptions compiler_config;
@@ -104,6 +105,10 @@ struct NnetTrainerOptions {
                    "the cached computation.");
     opts->Register("binary-write-cache", &binary_write_cache, "Write "
                    "computation cache in binary mode");
+    opts->Register("objective-scales", &objective_scales_str,
+                   "Objective scales for the outputs specified as "
+                   "a comma-separated list of pairs "
+                   "<output-name1>:<scale1>,<output-name2>:<scale2>...");
 
     // register the optimization options with the prefix "optimization".
     ParseOptions optimization_opts("optimization", opts);
@@ -116,6 +121,39 @@ struct NnetTrainerOptions {
   }
 };
 
+// This struct is used to store multiple objective function values
+// and do basic operations on all of them.
+struct ObjectiveValues {
+  std::vector<double> objective_values;
+
+  ObjectiveValues() { }
+
+  ObjectiveValues(const std::vector<double> &values):
+      objective_values(values) { }
+
+  ObjectiveValues(const std::vector<BaseFloat> &values);
+
+  int32 Size() const { return objective_values.size(); }
+  void Resize(int32 size);
+
+  void Add(const ObjectiveValues &other);
+
+  void Scale(BaseFloat scale);
+
+  void InvScale(BaseFloat inv_scale);
+
+  void InvScale(const std::vector<BaseFloat> &inv_scales);
+
+  void Reset() { Scale(0.0); }
+
+  bool IsZero() const;
+
+  double Sum() const;
+
+  std::string Str() const;
+};
+
+
 // This struct is used in multiple nnet training classes for keeping
 // track of objective function values.
 // Also see struct AccuracyInfo, in nnet-diagnostics.h.
@@ -126,20 +164,35 @@ struct ObjectiveFunctionInfo {
                               // 'current_phase'.
   double tot_weight;
   double tot_objf;
-  double tot_aux_objf;  // An 'auxiliary' objective function that is optional-
-                        // may be used when things like regularization are being
-                        // used.
+
+  // A struct used to store 'auxiliary' objective function values
+  // that is optional- may be used when things like regularization are being
+  // used.
+  ObjectiveValues tot_aux_objfs;
 
   double tot_weight_this_phase;
   double tot_objf_this_phase;
-  double tot_aux_objf_this_phase;
+  ObjectiveValues tot_aux_objfs_this_phase;
+
+  CuVector<double> deriv_sum;
+
+  BaseFloat objf_scale;
+  std::vector<BaseFloat> aux_objf_scales;
 
   ObjectiveFunctionInfo():
       current_phase(0), minibatches_this_phase(0),
-      tot_weight(0.0), tot_objf(0.0), tot_aux_objf(0.0),
+      tot_weight(0.0), tot_objf(0.0),
+      tot_weight_this_phase(0.0), tot_objf_this_phase(0.0),
+      objf_scale(1.0) { }
+
+  ObjectiveFunctionInfo(BaseFloat objf_scale,
+                        const std::vector<BaseFloat> aux_objf_scales):
+      current_phase(0),
+      minibatches_this_phase(0),
+      tot_weight(0.0), tot_objf(0.0),
       tot_weight_this_phase(0.0), tot_objf_this_phase(0.0),
-      tot_aux_objf_this_phase(0.0) { }
+      objf_scale(objf_scale), aux_objf_scales(aux_objf_scales) { }
 
   // This function updates the stats and, if the phase has just changed,
   // prints a message indicating progress.  The phase equals
@@ -150,7 +203,8 @@ struct ObjectiveFunctionInfo {
                    int32 minibatch_counter,
                    BaseFloat this_minibatch_weight,
                    BaseFloat this_minibatch_tot_objf,
-                   BaseFloat this_minibatch_tot_aux_objf = 0.0);
+                   const ObjectiveValues &this_minibatch_tot_aux_objfs
+                       = ObjectiveValues());
 
   // Prints stats for the current phase.
   // Note: 'phase' will normally be this->current_phase + 1, but may under
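
With the new constructor, a per-output objective scale and per-auxiliary-term scales can be attached to the stats object so that PrintTotalStats() divides them back out before logging. A small sketch of how the scales are expected to flow into the stats object; the scale values and the output name are illustrative only.

    // Sketch: stats for an output whose main objective was scaled by 0.5 and
    // whose single auxiliary term was scaled by 0.1 (example values).
    BaseFloat objf_scale = 0.5;
    std::vector<BaseFloat> aux_objf_scales;
    aux_objf_scales.push_back(0.1);

    ObjectiveFunctionInfo info(objf_scale, aux_objf_scales);
    // ... call info.UpdateStats("output", ...) once per minibatch ...
    // info.PrintTotalStats("output") then reports the per-frame averages with
    // the scales removed, so runs using different --objective-scales settings
    // remain comparable in the logs.
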
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index d16a728e2ab..1f3f35611b6 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -2151,6 +2151,26 @@ void ApplyL2Regularization(const Nnet &nnet,
   }
 }
 
+void ParseObjectiveScales(
+    const std::string &objective_scales_str,
+    std::unordered_map<std::string, BaseFloat> *objective_scales) {
+  objective_scales->clear();
+
+  std::vector<std::string> objectives_for_outputs;
+  SplitStringToVector(objective_scales_str, ", ", false,
+                      &objectives_for_outputs);
+  std::vector<std::string>::const_iterator it = objectives_for_outputs.begin();
+  for (; it != objectives_for_outputs.end(); ++it) {
+    std::vector<std::string> this_output_objective;
+    SplitStringToVector(*it, ":", false,
+                        &this_output_objective);
+
+    BaseFloat scale;
+    ConvertStringToReal(this_output_objective[1], &scale);
+    objective_scales->insert(
+        std::make_pair(this_output_objective[0], scale));
+  }
+}
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index c54fcf87e64..665a43c5a39 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -499,6 +499,9 @@ void ConstrainOrthonormal(Nnet *nnet);
 int32 GetNumNvalues(const std::vector<NnetIo> &io_vec,
                     bool exhaustive);
 
+void ParseObjectiveScales(
+    const std::string &objective_scales_str,
+    std::unordered_map<std::string, BaseFloat> *objective_scales);
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc
index fed6d529a82..c26e8ea5978 100644
--- a/src/nnet3bin/nnet3-get-egs.cc
+++ b/src/nnet3bin/nnet3-get-egs.cc
@@ -166,7 +166,7 @@ int main(int argc, char *argv[]) {
     bool compress = true;
     int32 num_pdfs = -1, length_tolerance = 100,
-        targets_length_tolerance = 2, 
+        targets_length_tolerance = 2,
         online_ivector_period = 1;
 
     ExampleGenerationConfig eg_config;  // controls num-frames,
@@ -192,7 +192,7 @@ int main(int argc, char *argv[]) {
                 "--online-ivectors option");
     po.Register("length-tolerance", &length_tolerance, "Tolerance for "
                 "difference in num-frames between feat and ivector matrices");
-    po.Register("targets-length-tolerance", &targets_length_tolerance, 
+    po.Register("targets-length-tolerance", &targets_length_tolerance,
                 "Tolerance for "
                 "difference in num-frames (after subsampling) between "
                 "feature matrix and posterior");
@@ -260,7 +260,7 @@ int main(int argc, char *argv[]) {
       }
 
      if (!ProcessFile(feats, online_ivector_feats, online_ivector_period,
-                       pdf_post, key, compress, num_pdfs, 
+                       pdf_post, key, compress, num_pdfs,
                        targets_length_tolerance,
                        &utt_splitter, &example_writer))
         num_err++;
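
ParseObjectiveScales turns the --objective-scales string into a per-output map. A short usage sketch follows; the output names are made up, and note that the parser assumes every entry contains a ':' followed by a number (there is no bounds check on the split result).

    // Sketch: parsing an --objective-scales string into a map from output
    // name to scale.  "output" and "output-xent" are example names only.
    std::unordered_map<std::string, BaseFloat> objective_scales;
    ParseObjectiveScales("output:1.0,output-xent:0.1", &objective_scales);
    // objective_scales["output"]      == 1.0
    // objective_scales["output-xent"] == 0.1
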
diff --git a/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc b/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc
index 5d500b5651d..54610639857 100644
--- a/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc
+++ b/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc
@@ -47,9 +47,11 @@ int main(int argc, char *argv[]) {
         " data/lang/words.txt ark:in.lats data/tensorflow_lstm/rnnlm ark:out.lats\n";
 
     ParseOptions po(usage);
+    bool write_compact = true;
     int32 max_ngram_order = 3;
     BaseFloat lm_scale = 0.5;
 
+    po.Register("write-compact", &write_compact, "If true, write in normal (compact) form.");
    po.Register("lm-scale", &lm_scale, "Scaling factor for language model "
                 "costs");
     po.Register("max-ngram-order", &max_ngram_order,
@@ -87,14 +89,45 @@ int main(int argc, char *argv[]) {
     KaldiTfRnnlmWrapper rnnlm(opts, rnn_word_list,
                               word_symbols_rxfilename, unk_prob_file,
                               rnnlm_rxfilename);
 
-    // Reads and writes as compact lattice.
-    SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier);
-    CompactLatticeWriter compact_lattice_writer(lats_wspecifier);
+    SequentialCompactLatticeReader compact_lattice_reader;
+    SequentialLatticeReader lattice_reader;
+
+    CompactLatticeWriter compact_lattice_writer;
+    LatticeWriter lattice_writer;
+
+    if (write_compact) {
+      compact_lattice_reader.Open(lats_rspecifier);
+      compact_lattice_writer.Open(lats_wspecifier);
+    } else {
+      lattice_reader.Open(lats_rspecifier);
+      lattice_writer.Open(lats_wspecifier);
+    }
 
     int32 n_done = 0, n_fail = 0;
 
-    for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) {
-      std::string key = compact_lattice_reader.Key();
-      CompactLattice &clat = compact_lattice_reader.Value();
+    for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done();
+         write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) {
+      std::string key;
+      CompactLattice clat;
+
+      // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count)
+      unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
+                    PairHasher<int32> > acoustic_scores;
+
+      if (write_compact) {
+        key = compact_lattice_reader.Key();
+        clat = compact_lattice_reader.Value();
+        compact_lattice_reader.FreeCurrent();
+      } else {
+        key = lattice_reader.Key();
+        const Lattice &lat = lattice_reader.Value();
+
+        // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count)
+        ComputeAcousticScoresMap(lat, &acoustic_scores);
+
+        ConvertLattice(lat, &clat);
+
+        lattice_reader.FreeCurrent();
+      }
 
       if (lm_scale != 0.0) {
         // Before composing with the LM FST, we scale the lattice weights
@@ -125,13 +158,34 @@ int main(int argc, char *argv[]) {
                      << " (incompatible LM?)";
           n_fail++;
         } else {
-          compact_lattice_writer.Write(key, determinized_clat);
+          if (write_compact) {
+            compact_lattice_writer.Write(key, determinized_clat);
+          } else {
+            Lattice out_lat;
+            fst::ConvertLattice(determinized_clat, &out_lat);
+
+            // Replace each arc (t, tid) with the averaged acoustic score from
+            // the computed map
+            ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat);
+            lattice_writer.Write(key, out_lat);
+          }
           n_done++;
         }
       } else {
         // Zero scale so nothing to do.
         n_done++;
-        compact_lattice_writer.Write(key, clat);
+
+        if (write_compact) {
+          compact_lattice_writer.Write(key, clat);
+        } else {
+          Lattice out_lat;
+          fst::ConvertLattice(clat, &out_lat);
+
+          // Replace each arc (t, tid) with the averaged acoustic score from
+          // the computed map
+          ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat);
+          lattice_writer.Write(key, out_lat);
+        }
       }
     }
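
The --write-compact=false path above exists because converting to CompactLattice folds the per-frame acoustic scores of an arc sequence into single arc weights; the (t, transition-id) -> (score-sum, count) map lets the tool restore the original per-frame breakdown on the way out. The round-trip, condensed into a sketch of the calls already used in the patch (error handling and the actual RNNLM composition are omitted; this is not a new API):

    // Sketch of the non-compact read / rescore / write round-trip.
    Lattice lat = lattice_reader.Value();
    unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
                  PairHasher<int32> > acoustic_scores;
    ComputeAcousticScoresMap(lat, &acoustic_scores);   // save per-frame scores

    CompactLattice clat;
    ConvertLattice(lat, &clat);
    // ... compose with the TF RNNLM and determinize, as in the code above ...

    Lattice out_lat;
    fst::ConvertLattice(clat, &out_lat);
    ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat);  // restore them
    lattice_writer.Write(key, out_lat);
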