From 75dcf22039a4ded47ef133cf48c32489bcc36336 Mon Sep 17 00:00:00 2001 From: Srikanth MADIKERI Date: Fri, 7 Jun 2019 11:58:45 +0200 Subject: [PATCH 001/196] replicating mini_librispeech/s5 to mini_librispeech/s5b --- egs/mini_librispeech/s5b/cmd.sh | 15 + egs/mini_librispeech/s5b/conf/decode.config | 1 + egs/mini_librispeech/s5b/conf/mfcc.conf | 1 + egs/mini_librispeech/s5b/conf/mfcc_hires.conf | 10 + .../s5b/conf/online_cmvn.conf | 1 + .../s5b/local/chain/compare_wer.sh | 137 ++++++++ .../local/chain/diagnostic/report_example.py | 80 +++++ .../s5b/local/chain/run_cnn_tdnn.sh | 1 + .../s5b/local/chain/run_tdnn.sh | 1 + .../local/chain/run_tdnn_discriminative.sh | 1 + .../s5b/local/chain/tuning/run_cnn_tdnn_1a.sh | 307 ++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1a.sh | 298 ++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1b.sh | 306 ++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1c.sh | 302 ++++++++++++++++ .../tuning/run_tdnn_1c_discriminative.sh | 250 +++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1d.sh | 304 ++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1e.sh | 303 ++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1f.sh | 311 +++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1g.sh | 311 +++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1g20.sh | 324 +++++++++++++++++ .../s5b/local/chain/tuning/run_tdnn_1h.sh | 300 ++++++++++++++++ egs/mini_librispeech/s5b/local/data_prep.sh | 1 + .../s5b/local/download_and_untar.sh | 100 ++++++ egs/mini_librispeech/s5b/local/download_lm.sh | 74 ++++ egs/mini_librispeech/s5b/local/format_lms.sh | 1 + .../s5b/local/grammar/extend_vocab_demo.sh | 328 ++++++++++++++++++ .../grammar/extend_vocab_demo_silprobs.sh | 326 +++++++++++++++++ .../s5b/local/grammar/simple_demo.sh | 177 ++++++++++ .../s5b/local/grammar/simple_demo_silprobs.sh | 175 ++++++++++ .../s5b/local/kws/compile_keywords.sh | 59 ++++ .../s5b/local/kws/create_categories.pl | 112 ++++++ .../s5b/local/kws/create_hitlist.sh | 72 ++++ 
.../s5b/local/kws/example/keywords.txt | 7 + .../s5b/local/kws/filter_kws_results.pl | 189 ++++++++++ .../s5b/local/kws/generate_hitlist.pl | 117 +++++++ .../s5b/local/kws/keywords_to_indices.pl | 123 +++++++ .../s5b/local/kws/make_L_align.sh | 59 ++++ .../s5b/local/kws/normalize_results_kst.pl | 203 +++++++++++ egs/mini_librispeech/s5b/local/kws/run_kws.sh | 108 ++++++ egs/mini_librispeech/s5b/local/kws/score.sh | 147 ++++++++ egs/mini_librispeech/s5b/local/kws/search.sh | 208 +++++++++++ .../s5b/local/nnet3/compare_wer.sh | 132 +++++++ .../s5b/local/nnet3/run_ivector_common.sh | 148 ++++++++ .../s5b/local/nnet3/run_tdnn_lstm.sh | 1 + .../local/nnet3/tuning/run_tdnn_lstm_1a.sh | 223 ++++++++++++ .../local/nnet3/tuning/run_tdnn_lstm_1b.sh | 228 ++++++++++++ .../local/nnet3/tuning/run_tdnn_lstm_1c.sh | 227 ++++++++++++ .../s5b/local/prepare_dict.sh | 1 + egs/mini_librispeech/s5b/local/score.sh | 63 ++++ .../s5b/local/subset_dataset.sh | 48 +++ egs/mini_librispeech/s5b/path.sh | 9 + egs/mini_librispeech/s5b/run.sh | 205 +++++++++++ egs/mini_librispeech/s5b/steps | 1 + egs/mini_librispeech/s5b/utils | 1 + 54 files changed, 7437 insertions(+) create mode 100644 egs/mini_librispeech/s5b/cmd.sh create mode 100644 egs/mini_librispeech/s5b/conf/decode.config create mode 100644 egs/mini_librispeech/s5b/conf/mfcc.conf create mode 100644 egs/mini_librispeech/s5b/conf/mfcc_hires.conf create mode 100644 egs/mini_librispeech/s5b/conf/online_cmvn.conf create mode 100755 egs/mini_librispeech/s5b/local/chain/compare_wer.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/diagnostic/report_example.py create mode 120000 egs/mini_librispeech/s5b/local/chain/run_cnn_tdnn.sh create mode 120000 egs/mini_librispeech/s5b/local/chain/run_tdnn.sh create mode 120000 egs/mini_librispeech/s5b/local/chain/run_tdnn_discriminative.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_cnn_tdnn_1a.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1a.sh 
create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1b.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1c.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1c_discriminative.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1d.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1e.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1f.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1g.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1g20.sh create mode 100755 egs/mini_librispeech/s5b/local/chain/tuning/run_tdnn_1h.sh create mode 120000 egs/mini_librispeech/s5b/local/data_prep.sh create mode 100755 egs/mini_librispeech/s5b/local/download_and_untar.sh create mode 100755 egs/mini_librispeech/s5b/local/download_lm.sh create mode 120000 egs/mini_librispeech/s5b/local/format_lms.sh create mode 100755 egs/mini_librispeech/s5b/local/grammar/extend_vocab_demo.sh create mode 100755 egs/mini_librispeech/s5b/local/grammar/extend_vocab_demo_silprobs.sh create mode 100755 egs/mini_librispeech/s5b/local/grammar/simple_demo.sh create mode 100755 egs/mini_librispeech/s5b/local/grammar/simple_demo_silprobs.sh create mode 100755 egs/mini_librispeech/s5b/local/kws/compile_keywords.sh create mode 100755 egs/mini_librispeech/s5b/local/kws/create_categories.pl create mode 100755 egs/mini_librispeech/s5b/local/kws/create_hitlist.sh create mode 100644 egs/mini_librispeech/s5b/local/kws/example/keywords.txt create mode 100755 egs/mini_librispeech/s5b/local/kws/filter_kws_results.pl create mode 100755 egs/mini_librispeech/s5b/local/kws/generate_hitlist.pl create mode 100755 egs/mini_librispeech/s5b/local/kws/keywords_to_indices.pl create mode 100755 egs/mini_librispeech/s5b/local/kws/make_L_align.sh create mode 100755 egs/mini_librispeech/s5b/local/kws/normalize_results_kst.pl create mode 100755 
egs/mini_librispeech/s5b/local/kws/run_kws.sh create mode 100755 egs/mini_librispeech/s5b/local/kws/score.sh create mode 100755 egs/mini_librispeech/s5b/local/kws/search.sh create mode 100755 egs/mini_librispeech/s5b/local/nnet3/compare_wer.sh create mode 100755 egs/mini_librispeech/s5b/local/nnet3/run_ivector_common.sh create mode 120000 egs/mini_librispeech/s5b/local/nnet3/run_tdnn_lstm.sh create mode 100755 egs/mini_librispeech/s5b/local/nnet3/tuning/run_tdnn_lstm_1a.sh create mode 100755 egs/mini_librispeech/s5b/local/nnet3/tuning/run_tdnn_lstm_1b.sh create mode 100755 egs/mini_librispeech/s5b/local/nnet3/tuning/run_tdnn_lstm_1c.sh create mode 120000 egs/mini_librispeech/s5b/local/prepare_dict.sh create mode 100755 egs/mini_librispeech/s5b/local/score.sh create mode 100755 egs/mini_librispeech/s5b/local/subset_dataset.sh create mode 100644 egs/mini_librispeech/s5b/path.sh create mode 100755 egs/mini_librispeech/s5b/run.sh create mode 120000 egs/mini_librispeech/s5b/steps create mode 120000 egs/mini_librispeech/s5b/utils diff --git a/egs/mini_librispeech/s5b/cmd.sh b/egs/mini_librispeech/s5b/cmd.sh new file mode 100644 index 000000000..71dd849a9 --- /dev/null +++ b/egs/mini_librispeech/s5b/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/mini_librispeech/s5b/conf/decode.config b/egs/mini_librispeech/s5b/conf/decode.config new file mode 100644 index 000000000..7ba966f2b --- /dev/null +++ b/egs/mini_librispeech/s5b/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. diff --git a/egs/mini_librispeech/s5b/conf/mfcc.conf b/egs/mini_librispeech/s5b/conf/mfcc.conf new file mode 100644 index 000000000..736150909 --- /dev/null +++ b/egs/mini_librispeech/s5b/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/mini_librispeech/s5b/conf/mfcc_hires.conf b/egs/mini_librispeech/s5b/conf/mfcc_hires.conf new file mode 100644 index 000000000..434834a67 --- /dev/null +++ b/egs/mini_librispeech/s5b/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600) diff --git a/egs/mini_librispeech/s5b/conf/online_cmvn.conf b/egs/mini_librispeech/s5b/conf/online_cmvn.conf new file mode 100644 index 000000000..7748a4a4d --- /dev/null +++ b/egs/mini_librispeech/s5b/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/mini_librispeech/s5b/local/chain/compare_wer.sh b/egs/mini_librispeech/s5b/local/chain/compare_wer.sh new file mode 100755 index 000000000..8ee5db232 --- /dev/null +++ b/egs/mini_librispeech/s5b/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/mini_librispeech/s5b/local/chain/diagnostic/report_example.py b/egs/mini_librispeech/s5b/local/chain/diagnostic/report_example.py new file mode 100755 index 000000000..f5cc954c7 --- /dev/null +++ b/egs/mini_librispeech/s5b/local/chain/diagnostic/report_example.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +# I ran from the shell: +# . ./path.sh +# steps/nnet3/report/convert_model.py exp/chain/tdnn1g_sp/24.mdl{,.pkl} +# steps/nnet3/report/convert_model.py exp/chain/tdnn1g_sp/25.mdl{,.pkl} +# .. and then this script: +# local/chain/diagnostic/report_example.py + +# Note: I make no claim that the information in the generated report is +# understandable in general; it's just something I was plotting for +# my own information. The point of this script is to demonstrate +# how to use steps/nnet3/report/convert_model.py. 
+ +import sys +sys.path.append("steps/nnet3/report") +import convert_model +import matplotlib as mpl +mpl.use('Agg') +import matplotlib.pyplot as plt +import pickle +import numpy as np + +# instead of the pickle.load commands, you could do in python, as follows: +# (but dumping them to disk first is faster in case you'll be running this +# script more than once). +# model1 = convert_model.read_model("exp/chain/tdnn1g_sp/24.mdl") +# model2 = convert_model.read_model("exp/chain/tdnn1g_sp/25.mdl") +model1 = pickle.load(open("exp/chain/tdnn1g_sp/24.mdl.pkl", "rb")) +model2 = pickle.load(open("exp/chain/tdnn1g_sp/25.mdl.pkl", "rb")) + +convert_model.compute_derived_quantities(model1) +convert_model.compute_derived_quantities(model2) +convert_model.compute_progress(model1, model2) + + +f, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3) +plt.tight_layout() +fs=5 +ss=4 +ax1.scatter(model1['tdnn4.affine']['col-norms-3'], + model1['tdnn3.affine']['row-change'], s=ss) +ax1.set_title('row-change3 versus column-norms4', + fontsize=fs) + +ax2.scatter(model1['tdnn4.affine']['col-norms-3'], + model1['tdnn3.affine']['rel-row-change'], s=ss) +ax2.set_title('rel-row-change3 versus column-norms4', + fontsize=fs) + +ax3.scatter(model1['tdnn4.affine']['col-norms-3'], + model1['tdnn3.affine']['row-norms'], s=ss) +ax3.set_title('row-norms3 versus column-norms4', + fontsize=fs) + +ax4.scatter(model1['tdnn4.affine']['col-norms'], + model1['tdnn4.affine']['rel-col-change'], s=ss) +ax4.set_title('rel-col-change4 versus col-norms4', + fontsize=fs) + +ax5.scatter(model1['tdnn3.batchnorm']['stats-stddev'], + model1['tdnn4.affine']['col-norms-3'], s=ss) +ax5.set_title('col-norms4 versus batch-norm-stddev3', + fontsize=fs) + + +#ax6.scatter(np.reciprocal(model1['tdnn3.relu']['deriv-avg']) * model1['tdnn4.affine']['col-norms-3'], +# model1['tdnn3.affine']['row-norms'], s=ss) +#ax6.set_title('row-norms3 vs predicted-row-norms3', +# fontsize=fs) + 
+ax6.scatter(model2['tdnn3.relu']['deriv-avg'] * model2['tdnn3.relu']['oderiv-rms'], + # model1['tdnn3.relu']['oderiv-rms'], + model2['tdnn3.affine']['row-norms'], s=ss) +ax6.set_xlim(left=0.00, right=0.009) +ax6.set_title('row-norms3 vs ideriv-rms3', + fontsize=fs) + + +plt.savefig('progress.pdf') diff --git a/egs/mini_librispeech/s5b/local/chain/run_cnn_tdnn.sh b/egs/mini_librispeech/s5b/local/chain/run_cnn_tdnn.sh new file mode 120000 index 000000000..ab83f3c43 --- /dev/null +++ b/egs/mini_librispeech/s5b/local/chain/run_cnn_tdnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_tdnn_1a.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5b/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5b/local/chain/run_tdnn.sh new file mode 120000 index 000000000..3922170ac --- /dev/null +++ b/egs/mini_librispeech/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1h.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5b/local/chain/run_tdnn_discriminative.sh b/egs/mini_librispeech/s5b/local/chain/run_tdnn_discriminative.sh new file mode 120000 index 000000000..2bae0a796 --- /dev/null +++ b/egs/mini_librispeech/s5b/local/chain/run_tdnn_discriminative.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1c_discriminative.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5b/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/mini_librispeech/s5b/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 000000000..c8f2503b5 --- /dev/null +++ b/egs/mini_librispeech/s5b/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,307 @@ +#!/bin/bash + +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1h.sh, but adding CNN layers +# near the beginning. 
+ +# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1h_sp cnn_tdnn1a_sp +#WER dev_clean_2 (tgsmall) 12.09 11.15 +# [online:] 12.11 11.17 +#WER dev_clean_2 (tglarge) 8.59 7.79 +# [online:] 8.76 7.80 +# Final train prob -0.0493 -0.0467 +# Final valid prob -0.0805 -0.0789 +# Final train prob (xent) -1.1730 -1.0767 +# Final valid prob (xent) -1.3872 -1.3070 +# Num-params 5207856 4492816 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2309 combine=-0.072->-0.069 xent:train/valid[3,5,final]=(-2.10,-1.62,-1.48/-2.26,-1.85,-1.77) logprob:train/valid[3,5,final]=(-0.096,-0.069,-0.060/-0.124,-0.107,-0.104) + +# local/chain/compare_wer.sh --online exp/chain/tdnn1a_sp +# System tdnn1a_sp +#WER dev_clean_2 (tgsmall) 18.58 +# [online:] 18.49 +#WER dev_clean_2 (tglarge) 13.35 +# [online:] 13.47 +# Final train prob -0.0596 +# Final valid prob -0.1036 +# Final train prob (xent) -1.4843 +# Final valid prob (xent) -1.7723 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer 
name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2309 combine=-0.085->-0.067 xent:train/valid[10,16,final]=(-1.98,-1.54,-1.37/-2.12,-1.76,-1.65) logprob:train/valid[10,16,final]=(-0.104,-0.076,-0.064/-0.129,-0.105,-0.101) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + 
relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.optimization.proportional-shrink=150.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2353 combine=-0.088->-0.064 xent:train/valid[10,16,final]=(-2.32,-1.61,-1.35/-2.49,-1.86,-1.63) logprob:train/valid[10,16,final]=(-0.164,-0.079,-0.061/-0.196,-0.117,-0.096) +# exp/chain/tdnn1c_sp: num-iters=17 nj=2..5 num-params=7.0M dim=40+100->2353 combine=-0.061->-0.050 xent:train/valid[10,16,final]=(-1.56,-1.17,-1.06/-1.85,-1.53,-1.46) logprob:train/valid[10,16,final]=(-0.081,-0.053,-0.046/-0.120,-0.096,-0.090) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1c # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=512 + relu-batchnorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + 
relu-batchnorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.optimization.proportional-shrink=150.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + data_dirs= + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 30 \ + $x $train_data_dir exp/shift_hires/ mfcc_hires + utils/fix_data_dir.sh ${train_data_dir}_fs$x + data_dirs="$data_dirs ${train_data_dir}_fs$x" + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp + done + utils/combine_data.sh ${train_data_dir}_fs $data_dirs + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + rm -r ${train_data_dir}_fs$x + done + fi + + train_data_dir=${train_data_dir}_fs + + affix=_fs +fi + +rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp +done +online_ivector_dir=${online_ivector_dir}_fs + +if [ $stage -le 1 ]; then + # hardcode no-GPU for alignment, although you could use GPU [you wouldn't + # get excellent GPU utilization though.] + nj=350 # have a high number of jobs because this could take a while, and we might + # have some stragglers. 
+ steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ + --online-ivector-dir $online_ivector_dir \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; +fi + +if [ -z "$lats_dir" ]; then + lats_dir=${srcdir}_denlats${affix} + if [ $stage -le 2 ]; then + nj=50 + # this doesn't really affect anything strongly, except the num-jobs for one of + # the phases of get_egs_discriminative.sh below. + num_threads_denlats=6 + subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving + # total slots = 80 * 6 = 480. + steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ + --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --online-ivector-dir $online_ivector_dir \ + --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ + $train_data_dir $lang $srcdir ${lats_dir} ; + fi +fi + +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] + +cmvn_opts=`cat $srcdir/cmvn_opts` + +if [ -z "$degs_dir" ]; then + degs_dir=${srcdir}_degs${affix} + + if [ $stage -le 3 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d ${srcdir}_degs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + fi + # have a higher maximum num-jobs if + if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi + + steps/nnet3/get_egs_discriminative.sh \ + --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ + --adjust-priors false --acwt 1.0 \ + --online-ivector-dir $online_ivector_dir \ + --left-context $left_context --right-context $right_context \ + $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \ + $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; + fi +fi + +if [ $stage -le 4 ]; then + steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ + --stage $train_stage \ + --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --criterion $criterion --drop-frames true --acoustic-scale 1.0 \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ + --regularization-opts "$regularization_opts" --use-frame-shift false \ + ${degs_dir} $dir ; +fi + +# decode the adjusted model +if [ $stage -le 5 ]; then + rm $dir/.error 2>/dev/null || true + + for x in `seq $decode_start_epoch $num_epochs`; do + for data in dev_clean_2; do + ( + iter=epoch${x}_adj + nspk=$(wc -l /dev/null || true + + for x in `seq $decode_start_epoch $num_epochs`; do + for data in dev_clean_2; do + ( + iter=epoch${x} + nspk=$(wc -l 2353 combine=-0.061->-0.050 xent:train/valid[10,16,final]=(-1.56,-1.17,-1.06/-1.85,-1.53,-1.46) logprob:train/valid[10,16,final]=(-0.081,-0.053,-0.046/-0.120,-0.096,-0.090) +# exp/chain/tdnn1d_sp: num-iters=17 nj=2..5 num-params=7.6M dim=40+100->2353 combine=-0.061->-0.052 
xent:train/valid[10,16,final]=(-1.69,-1.24,-1.08/-1.94,-1.57,-1.45) logprob:train/valid[10,16,final]=(-0.089,-0.057,-0.048/-0.127,-0.099,-0.088) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1d # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=512 + relu-batchnorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 dim=512 + relu-batchnorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.optimization.proportional-shrink=150.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + 
--egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2309 combine=-0.063->-0.052 xent:train/valid[10,16,final]=(-1.65,-1.23,-1.08/-1.91,-1.55,-1.42) logprob:train/valid[10,16,final]=(-0.084,-0.057,-0.047/-0.125,-0.100,-0.089) +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.061->-0.056 xent:train/valid[10,16,final]=(-1.69,-1.41,-1.41/-1.91,-1.67,-1.66) logprob:train/valid[10,16,final]=(-0.065,-0.055,-0.051/-0.104,-0.095,-0.089) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1e # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. 
+chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + 
--egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null +# local/chain/compare_wer.sh --online exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp +# System tdnn1e_sp tdnn1f_sp +#WER dev_clean_2 (tgsmall) 14.11 13.91 +# [online:] 14.07 13.96 +#WER dev_clean_2 (tglarge) 10.15 9.95 +# [online:] 10.16 10.13 +# Final train prob -0.0503 -0.0508 +# Final valid prob -0.0887 -0.0917 +# Final train prob (xent) -1.4257 -1.3509 +# Final valid prob (xent) -1.6799 -1.5883 +# Num-params 7508490 4205322 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f}_sp +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.057->-0.057 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.46,-1.43/-1.94,-1.72,-1.68) logprob:train/valid[10,16,final]=(-0.067,-0.055,-0.050/-0.105,-0.095,-0.089) +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 2) xent:train/valid[10,16,final]=(-1.60,-1.39,-1.35/-1.81,-1.64,-1.59) logprob:train/valid[10,16,final]=(-0.068,-0.056,-0.051/-0.104,-0.097,-0.092) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First 
# the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
decode_nj=10
train_set=train_clean_5
test_sets=dev_clean_2
gmm=tri3b
nnet3_affix=

# The rest are configs specific to this script.  Most of the parameters
# are just hardcoded at this level, in the commands below.
affix=1f   # affix for the TDNN directory name
tree_affix=
train_stage=-10
get_egs_stage=-10
decode_iter=

# training options
# training chunk-options
chunk_width=140,100,160
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
common_egs_dir=
xent_regularize=0.1

# training options
srand=0
remove_egs=true
reporting_email=

# decode options
test_online_decoding=true  # if true, it will run the last decoding stage.


# End configuration section.
# "$*" (not "$@") because the arguments are being joined into one string.
echo "$0 $*"  # Print the command line for logging

# cmd.sh and path.sh are sourced before parse_options.sh, which is sourced
# after the defaults above have been assigned.
# NOTE(review): $train_cmd and $decode_cmd used below are presumably defined
# by cmd.sh -- confirm.
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# NOTE(review): the source text is damaged at this point.  Everything between
# "cat <" and "$lang/topo" is missing, so the cuda-compiled check is cut off
# and none of the following variables, all of which are used by the stages
# below, is assigned anywhere in the surviving text:
#   $lang $gmm_dir $ali_dir $lat_dir $tree_dir $dir
#   $train_data_dir $lores_train_data_dir $train_ivector_dir
# The surviving fragment is preserved verbatim in the comment lines that
# follow.  Restore the missing part from the original script before running.
#
#   if ! cuda-compiled; then
#     cat <$lang/topo
#     fi
#   fi

if [ $stage -le 11 ]; then
  # Get the alignments as lattices (gives the chain training more freedom).
  # use the same num-jobs as the alignments
  steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \
    data/lang $gmm_dir $lat_dir
  rm $lat_dir/fsts.*.gz # save space
fi

if [ $stage -le 12 ]; then
  # Build a tree using our new topology.  We know we have alignments for the
  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
  # those.  The num-leaves is always somewhat less than the num-leaves from
  # the GMM baseline.
  if [ -f $tree_dir/final.mdl ]; then
    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
    exit 1;
  fi
  steps/nnet3/chain/build_tree.sh \
    --frame-subsampling-factor 3 \
    --context-opts "--context-width=2 --central-position=1" \
    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
    $lang $ali_dir $tree_dir
fi


if [ $stage -le 13 ]; then
  mkdir -p $dir
  echo "$0: creating neural net configs using the xconfig parser";

  # Number of output targets, read from the "num-pdfs" line of tree-info.
  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
  # 0.5 / xent_regularize, computed with awk so that the script does not
  # depend on a bare "python" executable being on the PATH.  For the default
  # xent_regularize=0.1 this yields "5".
  learning_rate_factor=$(awk -v x="$xent_regularize" 'BEGIN { print 0.5 / x }')
  opts="l2-regularize=0.05"
  output_opts="l2-regularize=0.02 bottleneck-dim=192"

  mkdir -p $dir/configs
  # The here-document is unquoted on purpose: $opts, $output_opts, $dir,
  # $num_targets and $learning_rate_factor are expanded while it is written.
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=40 name=input

  # please note that it is important to have input layer with the name=input
  # as the layer immediately preceding the fixed-affine-layer to enable
  # the use of short notation for the descriptor
  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

  # the first splicing is moved before the lda layer, so no splicing here
  relu-batchnorm-layer name=tdnn1 $opts dim=384
  relu-batchnorm-layer name=tdnn2 $opts dim=384 input=Append(-1,0,1)
  relu-batchnorm-layer name=tdnn3 $opts dim=384
  relu-batchnorm-layer name=tdnn4 $opts dim=384 input=Append(-1,0,1)
  relu-batchnorm-layer name=tdnn5 $opts dim=384
  relu-batchnorm-layer name=tdnn6 $opts dim=384 input=Append(-3,0,3)
  relu-batchnorm-layer name=tdnn7 $opts dim=384 input=Append(-3,0,3)
  relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0)

  ## adding the layers for chain branch
  relu-batchnorm-layer name=prefinal-chain $opts dim=384
  output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5

  # adding the layers for xent branch
  # This block prints the configs for a separate output that will be
  # trained with a cross-entropy objective in the 'chain' models... this
  # has the effect of regularizing the hidden parts of the model. we use
  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
  # 0.5 / args.xent_regularize is suitable as it means the xent
  # final-layer learns at a rate independent of the regularization
  # constant; and the 0.5 was tuned so as to make the relative progress
  # similar in the xent and regular final layers.
  relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=384
  output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi


if [ $stage -le 14 ]; then
  # Only on hosts whose fully-qualified name ends in .clsp.jhu.edu, and only
  # when $dir/egs/storage does not exist yet, set up $dir/egs/storage through
  # utils/create_split_dir.pl using the /export/b03 .. /export/b06 paths.
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl \
      /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
  fi

  steps/nnet3/chain/train.py --stage=$train_stage \
    --cmd="$decode_cmd" \
    --feat.online-ivector-dir=$train_ivector_dir \
    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient=0.1 \
    --chain.l2-regularize=0.00005 \
    --chain.apply-deriv-weights=false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --trainer.srand=$srand \
    --trainer.max-param-change=2.0 \
    --trainer.num-epochs=10 \
    --trainer.frames-per-iter=3000000 \
    --trainer.optimization.num-jobs-initial=2 \
    --trainer.optimization.num-jobs-final=5 \
    --trainer.optimization.initial-effective-lrate=0.001 \
    --trainer.optimization.final-effective-lrate=0.0001 \
    --trainer.optimization.shrink-value=1.0 \
    --trainer.num-chunk-per-minibatch=256,128,64 \
    --trainer.optimization.momentum=0.0 \
    --egs.chunk-width=$chunk_width \
    --egs.chunk-left-context=$chunk_left_context \
    --egs.chunk-right-context=$chunk_right_context \
    --egs.chunk-left-context-initial=0 \
    --egs.chunk-right-context-final=0 \
    --egs.dir="$common_egs_dir" \
    --egs.opts="--frames-overlap-per-eg 0" \
    --cleanup.remove-egs=$remove_egs \
    --use-gpu=true \
    --reporting.email="$reporting_email" \
    --feat-dir=$train_data_dir \
    --tree-dir=$tree_dir \
    --lat-dir=$lat_dir \
    --dir=$dir || exit 1;
fi

if [ $stage -le 15 ]; then
  # Note: it's not important to give mkgraph.sh the lang directory with the
  # matched topology (since it gets the topology file from the model).
  utils/mkgraph.sh \
    --self-loop-scale 1.0 data/lang_test_tgsmall \
    $tree_dir $tree_dir/graph_tgsmall || exit 1;
fi

# NOTE(review): the stage that starts here is damaged (text is missing after
# each "wc -l") and it runs past the end of the reviewed section.  It is
# preserved as found, in comment form, rather than reconstructed; restore it
# from the original script before running.
#
#   if [ $stage -le 16 ]; then
#     frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
#     rm $dir/.error 2>/dev/null || true
#
#     for data in $test_sets; do
#       (
#         nspk=$(wc -l /dev/null || true
#
#     for data in $test_sets; do
#       (
#         nspk=$(wc -l