vimalmanohar · vimalmanohar · Dec 3, 2016 · Dec 3, 2016 · Nov 23, 2016 · Nov 23, 2016
diff --git a/egs/ami/s5b/local/prepare_parallel_train_data.sh b/egs/ami/s5b/local/prepare_parallel_train_data.sh
@@ -5,6 +5,10 @@
 # but the wav data is copied from data/ihm.  This is a little tricky because the
 # utterance ids are different between the different mics
 
+train_set=train   # data subdirectory processed under data/ihm and data/$mic
+# Defined ahead of parse_options.sh, presumably so --train-set can override it (TODO confirm).
+. utils/parse_options.sh
+
 
 if [ $# != 1 ]; then
   echo "Usage: $0 [sdm1|mdm8]"
@@ -18,12 +22,10 @@ if [ $mic == "ihm" ]; then
   exit 1;
 fi
 
-train_set=train
-
 . cmd.sh
 . ./path.sh
 
-for f in data/ihm/train/utt2spk data/$mic/train/utt2spk; do
+for f in data/ihm/${train_set}/utt2spk data/$mic/${train_set}/utt2spk; do
   if [ ! -f $f ]; then
     echo "$0: expected file $f to exist"
     exit 1
@@ -32,12 +34,12 @@ done
 
 set -e -o pipefail
 
-mkdir -p data/$mic/train_ihmdata
+mkdir -p data/$mic/${train_set}_ihmdata
 
 # the utterance-ids and speaker ids will be from the SDM or MDM data
-cp data/$mic/train/{spk2utt,text,utt2spk} data/$mic/train_ihmdata/
+cp data/$mic/${train_set}/{spk2utt,text,utt2spk} data/$mic/${train_set}_ihmdata/
 # the recording-ids will be from the IHM data.
-cp data/ihm/train/{wav.scp,reco2file_and_channel} data/$mic/train_ihmdata/
+cp data/ihm/${train_set}/{wav.scp,reco2file_and_channel} data/$mic/${train_set}_ihmdata/
 
 # map sdm/mdm segments to the ihm segments
 
@@ -47,19 +49,17 @@ mic_base_upcase=$(echo $mic | sed 's/[0-9]//g' | tr 'a-z' 'A-Z')
 # It has lines like:
 # AMI_EN2001a_H02_FEO065_0021133_0021442 AMI_EN2001a_SDM_FEO065_0021133_0021442
 
-tmpdir=data/$mic/train_ihmdata/
+tmpdir=data/$mic/${train_set}_ihmdata/
 
-awk '{print $1, $1}' <data/ihm/train/utt2spk | \
+awk '{print $1, $1}' <data/ihm/${train_set}/utt2spk | \
   sed -e "s/_H[0-9][0-9]_/_${mic_base_upcase}_/" | \
   awk '{print $2, $1}' >$tmpdir/ihmutt2utt
 
 # Map the 1st field of the segments file from the ihm data (the 1st field being
 # the utterance-id) to the corresponding SDM or MDM utterance-id.  The other
 # fields remain the same (e.g. we want the recording-ids from the IHM data).
-utils/apply_map.pl -f 1 $tmpdir/ihmutt2utt <data/ihm/train/segments >data/$mic/train_ihmdata/segments
-
-utils/fix_data_dir.sh data/$mic/train_ihmdata
+utils/apply_map.pl -f 1 $tmpdir/ihmutt2utt <data/ihm/${train_set}/segments >data/$mic/${train_set}_ihmdata/segments
 
-rm $tmpdir/ihmutt2utt
+utils/fix_data_dir.sh data/$mic/${train_set}_ihmdata
 
 exit 0;
diff --git a/egs/aspire/s5/conf/mfcc_hires_bp.conf b/egs/aspire/s5/conf/mfcc_hires_bp.conf
@@ -0,0 +1,13 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=8000 #  Switchboard is sampled at 8kHz
+--num-mel-bins=28
+--num-ceps=28
+--cepstral-lifter=0
+--low-freq=330    # low cutoff frequency for mel bins
+--high-freq=-1000 # high cutoff frequency, relative to Nyquist of 4000 (=3000)
+
+
diff --git a/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh b/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh
@@ -0,0 +1,140 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0
+
+set -e            # abort on the first command that fails
+set -u            # treat expansion of an unset variable as an error
+set -o pipefail   # a pipeline fails if any of its stages fails
+
+. ./path.sh       # explicit ./ so the recipe's own path.sh is sourced, never one found via $PATH
+
+stage=0                 # a section guarded by "$stage -le N" runs only when stage <= N
+corruption_stage=-10    # NOTE(review): never referenced in this script
+corrupt_only=false      # if true, the script exits before feature extraction
+
+# Data options
+data_dir=data/train_si284   # Expecting whole data directory.
+speed_perturb=true      # if true, run the 3-way speed and the volume perturbation steps
+num_data_reps=5   # Number of corrupted versions
+snrs="20:10:15:5:0:-5"              # NOTE(review): never referenced in this script
+foreground_snrs="20:10:15:5:0:-5"   # passed as --foreground-snrs to reverberate_data_dir.py
+background_snrs="20:10:15:5:0:-5"   # passed as --background-snrs to reverberate_data_dir.py
+base_rirs=simulated     # "simulated" picks the simulated-RIR config; any other value the real-RIR one
+
+# Parallel options
+reco_nj=40              # passed as --nj to steps/make_mfcc.sh
+cmd=queue.pl            # passed as --cmd to steps/make_mfcc.sh
+
+# Options for feature extraction (the feature dir is named after the config's basename)
+mfcc_config=conf/mfcc_hires_bp_vh.conf   # NOTE(review): confirm this file exists; only conf/mfcc_hires_bp.conf is visible in this change
+feat_suffix=hires_bp_vh # if non-empty, features are made in a copy of the data dir named <dir>_<suffix>
+
+reco_vad_dir=   # Output of prepare_unsad_data.sh.
+                # If provided, the speech labels and deriv weights will be
+                # copied into the output data directory.
+
+. utils/parse_options.sh   # presumably lets --name value override the defaults above (TODO confirm)
+
+# Everything is configured through options; any positional argument is an error.
+if [ $# -ne 0 ]; then
+  echo "Usage: $0"; exit 1
+fi
+
+data_id=$(basename ${data_dir})   # last path component of the input data dir
+corrupted_data_id=${data_id}_corrupted
+
+# RIR and noise list options handed to reverberate_data_dir.py.
+case "$base_rirs" in
+  simulated)
+    # This is the config for the system using simulated RIRs and point-source noises
+    rvb_opts=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list"
+              --rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list"
+              --noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) ;;
+  *)
+    # This is the config for the JHU ASpIRE submission system
+    rvb_opts=(--rir-set-parameters "1.0, RIRS_NOISES/real_rirs_isotropic_noises/rir_list"
+              --noise-set-parameters RIRS_NOISES/real_rirs_isotropic_noises/noise_list) ;;
+esac
+
+# Stage 1: corrupt the data with reverberate_data_dir.py using the rvb_opts
+# RIR/noise lists; input and output are both addressed as data/<id>.
+if [ $stage -le 1 ]; then
+  corrupt_opts=("${rvb_opts[@]}" --prefix="rev"
+    --foreground-snrs=$foreground_snrs --background-snrs=$background_snrs
+    --speech-rvb-probability=1
+    --pointsource-noise-addition-probability=1
+    --isotropic-noise-addition-probability=1
+    --num-replications=$num_data_reps
+    --max-noises-per-minute=1)
+  python steps/data/reverberate_data_dir.py "${corrupt_opts[@]}" \
+    data/${data_id} data/${corrupted_data_id}
+fi
+
+corrupted_data_dir=data/${corrupted_data_id}
+
+if $speed_perturb; then
+  if [ $stage -le 2 ]; then
+    # Assuming a whole data directory, reco2dur is reused as utt2dur.
+    # Only the corrupted dir is perturbed: clean_data_dir and noise_data_dir are
+    # never set in this script, and expanding them aborted it under "set -u".
+    cp $corrupted_data_dir/reco2dur $corrupted_data_dir/utt2dur
+    utils/data/perturb_data_dir_speed_3way.sh $corrupted_data_dir ${corrupted_data_dir}_sp
+  fi
+
+  corrupted_data_dir=${corrupted_data_dir}_sp
+  corrupted_data_id=${corrupted_data_id}_sp
+
+  if [ $stage -le 3 ]; then
+    utils/data/perturb_data_dir_volume.sh --scale-low 0.03125 --scale-high 2 \
+      ${corrupted_data_dir}
+  fi
+fi
+
+# When corrupt_only is true, stop here: no features are extracted.
+if $corrupt_only; then
+  echo "$0: Got corrupted data directory in ${corrupted_data_dir}"; exit 0
+fi
+
+# Feature directory: the config's file name with a trailing ".conf" removed.
+mfccdir=$(basename $mfcc_config); mfccdir=${mfccdir%.conf}
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+  utils/create_split_dir.pl \
+    /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
+fi
+
+# Stage 4: make MFCCs and run compute_cmvn_stats.sh --fake.  With a non-empty
+# feat_suffix the work is done in a copy named <data dir>_<feat_suffix>.
+feat_src_dir=$corrupted_data_dir
+if [ -n "$feat_suffix" ]; then   # quoted so an empty or multi-word value is tested safely
+  corrupted_data_dir=${corrupted_data_dir}_$feat_suffix
+fi
+if [ $stage -le 4 ]; then
+  if [ -n "$feat_suffix" ]; then
+    utils/copy_data_dir.sh $feat_src_dir $corrupted_data_dir
+  fi
+  steps/make_mfcc.sh --mfcc-config $mfcc_config --cmd "$cmd" --nj $reco_nj \
+    $corrupted_data_dir exp/make_${mfccdir}/${corrupted_data_id} $mfccdir
+  steps/compute_cmvn_stats.sh --fake \
+    $corrupted_data_dir exp/make_${mfccdir}/${corrupted_data_id} $mfccdir
+fi
+
+# Stage 8: if reco_vad_dir is given, pass its speech_feat.scp and
+# deriv_weights.scp through get_reverb_scp.pl (with $num_data_reps) and write
+# the sorted results into the corrupted data directory.
+if [ $stage -le 8 ] && [ -n "$reco_vad_dir" ]; then
+  # Check both inputs before any output is written.
+  for f in speech_feat.scp deriv_weights.scp; do
+    if [ ! -f $reco_vad_dir/$f ]; then
+      echo "$0: Could not find file $reco_vad_dir/$f"
+      exit 1
+    fi
+  done
+  for f in speech_feat.scp deriv_weights.scp; do
+    steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps \
+      <$reco_vad_dir/$f | sort -k1,1 >${corrupted_data_dir}/$f
+  done
+fi
+
+exit 0