kaldi-asr · vimalmanohar · Aug 11, 2015 · Aug 11, 2015 · Aug 14, 2015 · Aug 18, 2015
diff --git a/egs/aspire/s5/cmd.sh b/egs/aspire/s5/cmd.sh
@@ -6,11 +6,11 @@
 # the number of cpus on your machine.
 
 #a) JHU cluster options
-export train_cmd="queue.pl -l arch=*64"
-export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
-export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
+export train_cmd="queue.pl" 
+export decode_cmd="queue.pl --mem 2G"
+export mkgraph_cmd="queue.pl --mem 4G"
 
-export cuda_cmd="queue.pl -l gpu=1 -q g.q"
+export cuda_cmd="queue.pl --gpu 1"
 
 
 #b) BUT cluster options

diff --git a/egs/aspire/s5/conf/fbank.conf b/egs/aspire/s5/conf/fbank.conf
@@ -0,0 +1,6 @@
+# config for high-resolution Fbank features
+--use-energy=false   # do not add energy 
+--sample-frequency=8000 #  Switchboard is sampled at 8kHz
+--num-mel-bins=40     # similar to Google's setup.
+--low-freq=40    # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
diff --git a/egs/aspire/s5/conf/fbank_bp.conf b/egs/aspire/s5/conf/fbank_bp.conf
@@ -0,0 +1,8 @@
+# config for high-resolution Fbank features
+--use-energy=false   # do not add energy 
+--sample-frequency=8000 #  Switchboard is sampled at 8kHz
+--num-mel-bins=152    # similar to Google's setup.
+--num-fft-bins=512
+--low-freq=330    # low cutoff frequency for mel bins
+--high-freq=-1000 # high cutoff frequency, relative to Nyquist of 4000 (=3000)
+
diff --git a/egs/aspire/s5/conf/mfcc_diarization.conf b/egs/aspire/s5/conf/mfcc_diarization.conf
@@ -0,0 +1,6 @@
+--sample-frequency=8000 
+--frame-length=25 # the default is 25, but we usually use 20 for SID
+--low-freq=20 # the default.
+--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
+--num-ceps=20 # higher than the default which is 12.
+--snip-edges=false
diff --git a/egs/aspire/s5/conf/mfcc_hires_bp.conf b/egs/aspire/s5/conf/mfcc_hires_bp.conf
@@ -0,0 +1,12 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=8000 #  Switchboard is sampled at 8kHz
+--num-mel-bins=152     # similar to Google's setup.
+--num-ceps=152         # there is no dimensionality reduction.
+--num-fft-bins=512
+--low-freq=330    # low cutoff frequency for mel bins
+--high-freq=-1000 # high cutoff frequency, relative to Nyquist of 4000 (=3000)
+
diff --git a/egs/aspire/s5/conf/mfcc_vad.conf b/egs/aspire/s5/conf/mfcc_vad.conf
@@ -0,0 +1,5 @@
+--sample-frequency=8000 
+--frame-length=25 # the default is 25.
+--low-freq=20 # the default.
+--high-freq=-300 # the default is zero meaning use the Nyquist (4k in this case).
+--num-ceps=13 # the default is 13.
diff --git a/egs/aspire/s5/conf/segmentation.conf b/egs/aspire/s5/conf/segmentation.conf
@@ -0,0 +1,24 @@
+method=Viterbi
+
+# General segmentation options
+max_intersegment_length=50  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+max_relabel_length=10  # maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby.
+pad_length=5         # Pad speech segments by this many frames on either side
+max_segment_length=1000   # Segments that are longer than this are split into
+                          # overlapping segments.
+overlap_length=100        # Overlapping frames when segments are split.
+                          # See the above option.
+
+# Viterbi options
+min_silence_duration=30   # minimum number of frames for silence
+min_speech_duration=30    # minimum number of frames for speech
+speech_to_sil_ratio=1     # the prior on speech vs silence
+
+# Decoding options
+acwt=1
+beam=10
+max_active=7000
+
diff --git a/egs/aspire/s5/conf/segmentation_aspire.conf b/egs/aspire/s5/conf/segmentation_aspire.conf
@@ -0,0 +1,25 @@
+method=Viterbi
+
+# General segmentation options
+max_intersegment_length=50  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+max_relabel_length=10  # maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby.
+pad_length=5         # Pad speech segments by this many frames on either side
+max_segment_length=1000   # Segments that are longer than this are split into
+                          # overlapping segments.
+overlap_length=100        # Overlapping frames when segments are split.
+                          # See the above option.
+
+# Viterbi options
+min_silence_duration=30   # minimum number of frames for silence
+min_speech_duration=30    # minimum number of frames for speech
+speech_to_sil_ratio=1     # the prior on speech vs silence
+
+# Decoding options
+acwt=1
+beam=10
+max_active=7000
+
+
diff --git a/egs/aspire/s5/conf/segmentation_babel.conf b/egs/aspire/s5/conf/segmentation_babel.conf
@@ -0,0 +1,26 @@
+method=Viterbi
+
+# General segmentation options
+max_intersegment_length=100  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+max_relabel_length=10  # maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby.
+pad_length=10            # Pad speech segments by this many frames on either side
+post_pad_length=10           # Pad speech segments by this many frames on either side
+max_segment_length=1000   # Segments that are longer than this are split into
+                          # overlapping segments.
+overlap_length=100         # Overlapping frames when segments are split.
+                          # See the above option.
+
+# Viterbi options
+min_silence_duration=30   # minimum number of frames for silence
+min_speech_duration=30    # minimum number of frames for speech
+speech_to_sil_ratio=1     # the prior on speech vs silence
+
+# Decoding options
+acwt=1
+beam=10
+max_active=7000
+
+
diff --git a/egs/aspire/s5/conf/vad_icsi_babel.conf b/egs/aspire/s5/conf/vad_icsi_babel.conf
@@ -0,0 +1,39 @@
+## Feature parameters
+window_size=10                   # 100 ms
+frames_per_gaussian=200
+
+## Phase 1 parameters
+num_frames_init_silence=2000      # 20s - Lowest energy frames selected to initialize Silence GMM
+num_frames_init_sound=10000       # 100s - Highest energy frames selected to initialize Sound GMM
+num_frames_init_sound_next=2000   # 20s - Highest zero crossing frames selected to initialize Sound GMM
+sil_num_gauss_init=2
+sound_num_gauss_init=2
+sil_max_gauss=2
+sound_max_gauss=6
+sil_gauss_incr=0
+sound_gauss_incr=2
+num_iters=5
+min_sil_variance=0.1
+min_sound_variance=0.01
+min_speech_variance=0.001
+
+## Phase 2 parameters
+speech_num_gauss_init=6
+sil_max_gauss_phase2=7
+sound_max_gauss_phase2=18
+speech_max_gauss_phase2=16
+sil_gauss_incr_phase2=1
+sound_gauss_incr_phase2=2
+speech_gauss_incr_phase2=2
+num_iters_phase2=5
+
+## Phase 3 parameters
+sil_num_gauss_init_phase3=2
+speech_num_gauss_init_phase3=2
+sil_max_gauss_phase3=5
+speech_max_gauss_phase3=12
+sil_gauss_incr_phase3=1
+speech_gauss_incr_phase3=2
+num_iters_phase3=7
+
+
diff --git a/egs/aspire/s5/conf/vad_icsi_babel_3models.conf b/egs/aspire/s5/conf/vad_icsi_babel_3models.conf
@@ -0,0 +1,54 @@
+## Feature parameters
+window_size=10                   # 100 ms
+frames_per_gaussian=200
+
+## Phase 1 parameters
+num_frames_init_silence=2000      # 20s - Lowest energy frames selected to initialize Silence GMM
+num_frames_init_sound=10000       # 100s - Highest energy frames selected to initialize Sound GMM
+num_frames_init_sound_next=2000   # 20s - Highest zero crossing frames selected to initialize Sound GMM
+sil_num_gauss_init=2
+sound_num_gauss_init=2
+sil_max_gauss=2
+sound_max_gauss=6
+sil_gauss_incr=0
+sound_gauss_incr=2
+num_iters=5
+min_sil_variance=0.1
+min_sound_variance=0.01
+min_speech_variance=0.001
+
+## Phase 2 parameters
+speech_num_gauss_init=6
+sil_max_gauss_phase2=7
+sound_max_gauss_phase2=18
+speech_max_gauss_phase2=16
+sil_gauss_incr_phase2=1
+sound_gauss_incr_phase2=2
+speech_gauss_incr_phase2=2
+num_iters_phase2=5
+
+## Phase 3 and 4 parameters
+num_frames_silence_phase3_init=2000
+num_frames_speech_phase3_init=2000
+sil_num_gauss_init_phase3=2
+speech_num_gauss_init_phase3=2
+sil_max_gauss_phase3=5
+sil_max_gauss_phase4=8
+speech_max_gauss_phase4=16
+sil_gauss_incr_phase3=1
+sil_gauss_incr_phase4=1
+speech_gauss_incr_phase4=2
+num_iters_phase3=5
+num_iters_phase4=5
+
+## Phase 5 parameters
+sil_num_gauss_init_phase5=2
+speech_num_gauss_init_phase5=2
+sil_max_gauss_phase5=5
+speech_max_gauss_phase5=12
+sil_gauss_incr_phase5=1
+speech_gauss_incr_phase5=2
+num_iters_phase5=7
+
+
+
diff --git a/egs/aspire/s5/conf/vad_icsi_rt.conf b/egs/aspire/s5/conf/vad_icsi_rt.conf
@@ -0,0 +1,41 @@
+## Feature parameters
+window_size=10                   # 100 ms
+frames_per_gaussian=2000
+
+## Phase 1 parameters
+num_frames_init_silence=2000
+num_frames_init_sound=10000       
+num_frames_init_sound_next=2000   
+sil_num_gauss_init=2
+sound_num_gauss_init=2
+sil_max_gauss=2
+sound_max_gauss=6
+sil_gauss_incr=0
+sound_gauss_incr=2
+num_iters=5
+min_sil_variance=0.1
+min_sound_variance=0.01
+min_speech_variance=0.001
+
+## Phase 2 parameters
+num_frames_init_speech=10000
+speech_num_gauss_init=6
+sil_max_gauss_phase2=7
+sound_max_gauss_phase2=18
+speech_max_gauss_phase2=16
+sil_gauss_incr_phase2=1
+sound_gauss_incr_phase2=2
+speech_gauss_incr_phase2=2
+num_iters_phase2=5
+
+## Phase 3 parameters
+sil_num_gauss_init_phase3=2
+speech_num_gauss_init_phase3=2
+sil_max_gauss_phase3=5
+speech_max_gauss_phase3=12
+sil_gauss_incr_phase3=1
+speech_gauss_incr_phase3=2
+num_iters_phase3=7
+
+
+
diff --git a/egs/aspire/s5/conf/weights_segmentation_aspire.conf b/egs/aspire/s5/conf/weights_segmentation_aspire.conf
@@ -0,0 +1,26 @@
+method=Viterbi
+
+# General segmentation options
+max_intersegment_length=0  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+max_relabel_length=0  # maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby.
+pad_length=0         # Pad speech segments by this many frames on either side
+max_segment_length=2000   # Segments that are longer than this are split into
+                          # overlapping segments.
+overlap_length=0        # Overlapping frames when segments are split.
+                        # See the above option.
+
+# Viterbi options
+min_silence_duration=30   # minimum number of frames for silence
+min_speech_duration=30    # minimum number of frames for speech
+speech_to_sil_ratio=0.1   # the prior on speech vs silence
+
+# Decoding options
+acwt=1
+beam=10
+max_active=7000
+
+
+
diff --git a/egs/aspire/s5/conf/weights_segmentation_babel.conf b/egs/aspire/s5/conf/weights_segmentation_babel.conf
@@ -0,0 +1,25 @@
+method=Viterbi
+
+# General segmentation options
+max_intersegment_length=0  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+max_relabel_length=0  # maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby.
+pad_length=0         # Pad speech segments by this many frames on either side
+max_segment_length=2000   # Segments that are longer than this are split into
+                          # overlapping segments.
+overlap_length=0        # Overlapping frames when segments are split.
+                        # See the above option.
+
+# Viterbi options
+min_silence_duration=30   # minimum number of frames for silence
+min_speech_duration=30    # minimum number of frames for speech
+speech_to_sil_ratio=0.1   # the prior on speech vs silence
+
+# Decoding options
+acwt=1
+beam=10
+max_active=7000
+
+
diff --git a/egs/aspire/s5/conf/zc_vad.conf b/egs/aspire/s5/conf/zc_vad.conf
@@ -0,0 +1,4 @@
+--sample-frequency=8000 
+--frame-length=25 # the default is 25.
+--dither=0.0
+--zero-crossing-threshold=1e-5
diff --git a/egs/aspire/s5/diarization b/egs/aspire/s5/diarization
@@ -0,0 +1 @@
+../../sre08/v1/diarization
diff --git a/egs/aspire/s5/local/multi_condition/combine_ali_dirs.sh b/egs/aspire/s5/local/multi_condition/combine_ali_dirs.sh
@@ -6,6 +6,7 @@
 # Begin configuration section. 
 extra_files= # specify addtional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
 ref_data_dir= # data directory to be used as reference for rearranging alignments
+cmd=run.pl
 # End configuration section.
 
 echo "$0 $@"  # Print the command line for logging
@@ -78,7 +79,7 @@ if [ ! -z "$ref_data_dir" ]; then
     awk -v p=\$ali_file '{printf "%s %s %s\n", \$1, p, NR}' > $temp_dir/ali_utt_index.\$JOB
 EOF
   chmod +x $temp_dir/create_ali_utt_index.sh
-  $decode_cmd -v PATH JOB=1:$num_jobs $temp_dir/ali_copy_int.JOB.log $temp_dir/create_ali_utt_index.sh JOB
+  $cmd -v PATH JOB=1:$num_jobs $temp_dir/ali_copy_int.JOB.log $temp_dir/create_ali_utt_index.sh JOB
 
   cat <<EOF >$temp_dir/create_new_ali.py
 
@@ -147,7 +148,7 @@ EOF
   # split the ref_data_dir to get reference utt2spk for individual ali.JOB.gz files
   utils/split_data.sh $ref_data_dir $num_jobs
 
-  $decode_cmd -v PATH JOB=1:$num_jobs $temp_dir/create_new_ali.JOB.run.log \
+  $cmd JOB=1:$num_jobs $temp_dir/create_new_ali.JOB.run.log \
     python $temp_dir/create_new_ali.py \
       $ref_data_dir/split$num_jobs/JOB/utt2spk \
       $temp_dir/create_new_ali.JOB.sh $temp_dir/ali.JOB.gz || exit 1;

diff --git a/egs/aspire/s5/local/multi_condition/copy_ali_dir.sh b/egs/aspire/s5/local/multi_condition/copy_ali_dir.sh
@@ -18,6 +18,7 @@
 # begin configuration section
 utt_prefix=
 utt_suffix=
+cmd=run.pl
 # end configuration section
 
 . utils/parse_options.sh
@@ -72,6 +73,6 @@ for line in sys.stdin:
 set +o pipefail; # unset the pipefail option.
 EOF
 chmod +x $dest_dir/temp/copy_ali.sh
-$decode_cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_ali.JOB.log $dest_dir/temp/copy_ali.sh JOB || exit 1;
+$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_ali.JOB.log $dest_dir/temp/copy_ali.sh JOB || exit 1;
 
 echo "$0: copied alignments from $src_dir to $dest_dir"