From c8deb5f5dfa181801b3156268a8cd184b677c534 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 28 Oct 2015 05:12:07 -0400 Subject: [PATCH] adding files related the simulated RIR experiment to the Aspire folder --- .../s5/local/multi_condition/rirs/IR_stats.m | 305 ++++++++++++ .../s5/local/multi_condition/rirs/octdsgn.m | 45 ++ .../multi_condition/rirs/prep_sim_rirs.sh | 109 +++++ .../multi_condition/rirs/rir_generator.cpp | 459 ++++++++++++++++++ .../local/multi_condition/run_nnet2_common.sh | 1 - .../multi_condition/run_nnet2_common_clean.sh | 91 ++++ .../multi_condition/run_nnet2_common_sim.sh | 141 ++++++ .../s5/local/multi_condition/run_nnet2_ms.sh | 3 +- .../multi_condition/run_nnet2_ms_clean.sh | 108 +++++ .../local/multi_condition/run_nnet2_ms_sim.sh | 108 +++++ .../multi_condition/run_nnet3_ms_clean.sh | 93 ++++ 11 files changed, 1460 insertions(+), 3 deletions(-) create mode 100644 egs/aspire/s5/local/multi_condition/rirs/IR_stats.m create mode 100644 egs/aspire/s5/local/multi_condition/rirs/octdsgn.m create mode 100755 egs/aspire/s5/local/multi_condition/rirs/prep_sim_rirs.sh create mode 100644 egs/aspire/s5/local/multi_condition/rirs/rir_generator.cpp create mode 100755 egs/aspire/s5/local/multi_condition/run_nnet2_common_clean.sh create mode 100755 egs/aspire/s5/local/multi_condition/run_nnet2_common_sim.sh create mode 100755 egs/aspire/s5/local/multi_condition/run_nnet2_ms_clean.sh create mode 100755 egs/aspire/s5/local/multi_condition/run_nnet2_ms_sim.sh create mode 100755 egs/aspire/s5/local/multi_condition/run_nnet3_ms_clean.sh diff --git a/egs/aspire/s5/local/multi_condition/rirs/IR_stats.m b/egs/aspire/s5/local/multi_condition/rirs/IR_stats.m new file mode 100644 index 00000000000..0eeac09ee46 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/rirs/IR_stats.m @@ -0,0 +1,305 @@ +function [rt,drr,cte,cfs,edt] = IR_stats(x, fs, varargin) +%function [rt,drr,cte,cfs,edt] = IR_stats(filename, varargin) +%function [rt,drr,cte,cfs,edt] = IR_stats(x, fs, varargin) +% Calculate RT, DRR, Cte, and EDT for impulse response file +% +% RT = IR_STATS(FILENAME) returns the reverberation time (to -60 dB) +% using a method based on ISO 3382-1:2009. The function uses reverse +% cumulative trapezoidal integration to estimate the decay curve, and a +% linear least-square fit to estimate the slope between 0 dB and -60 dB. +% Estimates are taken in octave bands and the overall figure is an +% average of the 500 Hz and 1 kHz bands. +% +% FILENAME should be the full path to an audio file or the name of an +% audio file on the Matlab search path. The file can be of any format +% supported by the AUDIOREAD function, and have any number of channels; +% estimates (and plots) will be returned for each channel. +% +% The function returns a 1xN vector of RTs, where N is the number of +% channels in the audio file. +% +% The function determines the direct sound as the peak of the squared +% impulse response. +% +% [RT,DRR] = IR_STATS(FILENAME) returns the direct-to-reverberant-ratio +% DRR for the impulse; DRR is the same size as RT. This is calculated +% in the following way: +% +% DRR = 10 * log10( X(T0-C:T0+C)^2 / X(T0+C+1:end)^2 ) +% +% where X is the approximated integral of the impulse, T0 is the time of +% the direct impulse, and C=2.5ms [1]. +% +% [RT,DRR,CTE] = IR_STATS(FILENAME) returns the early-to-late index CTE +% for the impulse; CTE is the same size as RT. This is calculated in +% the following way: +% +% CTE = 10 * log10( X(T0-C:T0+TE)^2 / X(T0+TE+1:end)^2 ) +% +% where TE is 50 ms. +% +% [RT,DRR,CTE,CFS] = IR_STATS(FILENAME) returns the octave-band centre +% frequencies CFS used in the calculation of RT. +% +% [RT,DRR,CTE,CFS,EDT] = IR_STATS(FILENAME) returns the early decay +% time EDT, which is the same size as RT. The slope of the decay curve +% is determined from the fit between 0 and -10 dB. The decay time is +% calculated from the slope as the time required for a 60 dB decay. +% +% ... = IR_STATS(...,'PARAMETER',VALUE) allows numerous +% parameters to be specified. These parameters are: +% +% 'graph' : {false} | true +% Controls whether decay curves are plotted. Specifically, graphs +% are plotted of the impulse response, decay curves, and linear +% least-square fit for each octave band and channel of the audio +% file. If the EDT output is specified, the EDT fit will also be +% plotted. +% 'te' : {0.05} | scalar +% Specifies the early time limit (in seconds). +% 'spec' : {'mean'} | 'full' +% Determines the nature of RT and EDT outputs. With spec='mean' +% (default) the reported RT and EDT are the mean of the 500 Hz +% and 1 kHz bands. With spec='full', the function returns the +% RT and EDT as calculated for each octave band returned in +% CFS; RT and EDT have size [M N] where M=length(CFS). +% 'y_fit' : {[0 60]} | two-element vector +% Specifies the decibel range over which the decay curve should +% be evaluated. For example, 'y_fit' may be [-5 -25] or [-5 -35] +% corresponding to the RT20 and RT30 respectively. +% 'correction' : {0.0025} | scalar +% Specifies the correction parameter C (in seconds) given above +% for DRR and CTE calculations. Values of up to 10 ms have been +% suggested in the literature. +% +% Octave-band filters are calculated according to ANSI S1.1-1986 and IEC +% standards. Note that the OCTDSGN function recommends centre frequencies +% fc in the range fs/200 < fc < fs/5. +% +% The author would like to thank Feifei Xiong for his input on the +% correction parameter. +% +% References +% +% [1] Zahorik, P., 2002: 'Direct-to-reverberant energy ratio +% sensitivity', The Journal of the Acoustical Society of America, +% 112, 2110-2117. +% +% See also AUDIOREAD, OCTDSGN. + +% ========================================================================= +% Last changed: $Date: 2015-06-19 12:05:24 +0100 (Fri, 19 Jun 2015) $ +% Last committed: $Revision: 387 $ +% Last changed by: $Author: ch0022 $ +% ========================================================================= + + %% validate inputs and set options + + % check file exists + %assert(exist(filename,'file')==2,['IR_stats: ' filename ' does not exist']) + + % set defaults + options = struct(... + 'graph',false,... + 'te',0.05,... + 'spec','mean',... + 'y_fit',[0 -60],... + 'correction',0.0025); + + % read parameter/value inputs + if nargin>1 % if parameters are specified + % read the acceptable names + optionNames = fieldnames(options); + % count arguments + nArgs = length(varargin); + if round(nArgs/2)~=nArgs/2 + error('IR_STATS needs propertyName/propertyValue pairs') + end + % overwrite defults + for pair = reshape(varargin,2,[]) % pair is {propName;propValue} + IX = strcmpi(pair{1},optionNames); % find match parameter names + if any(IX) + % do the overwrite + options.(optionNames{IX}) = pair{2}; + else + error('%s is not a recognized parameter name',pair{1}) + end + end + end + + %% read in audio file + + % read in impulse + %[x,fs] = audioread(filename); + assert(fs>=5000,'Sampling frequency is too low. FS must be at least 5000 Hz.') + + % set te in samples + te = round(options.te*fs); + + % Check sanity of te + assert(tefs/200 & cfs=2 + drr(n) = 10.*log(... + trapz(x(max(1,t0(n)-correction):t0(n)+correction,n).^2)/... + trapz(x(t0(n)+correction+1:end,n).^2)... + ); + end + % Cte + if nargout>=3 + if t0(n)+te+1>size(x,1) + warning(['Early time limit (te) out of range in channel ' num2str(n) '. Try lowering te.']) + cte(n) = NaN; + else + cte(n) = 10.*log(... + trapz(x(max(1,t0(n)-correction):t0(n)+te).^2)/... + trapz(x(t0(n)+te+1:end,n).^2)... + ); + end + end + end + + %% write output + + switch lower(options.spec) + case 'full' + rt = rt_temp; + case 'mean' + rt = mean(rt_temp(cfs==500 | cfs==1000,:)); % overall RT + edt = mean(edt(cfs==500 | cfs==1000,:)); % overall EDT + otherwise + error('Unknown ''spec'': must be ''full'' or ''mean''.') + end + +end + +function [t,E,fit] = calc_decay(z,y_fit,y_dec,fs) +% CALC_DECAY calculate decay time from decay curve +% Returns the time for a specified decay y_dec calculated +% from the fit over the range y_fit. The input is the +% integral of the impulse sample at fs Hz. The function also +% returns the energy decay curve in dB and the corresponding +% fit. + + E = 10.*log10(z); % put into dB + E = E-max(E); % normalise to max 0 + E = E(1:find(isinf(E),1,'first')-1); % remove trailing infinite values + IX = find(E<=max(y_fit),1,'first'):find(E<=min(y_fit),1,'first'); % find yfit x-range + if isempty(IX) + error('Impulse response has insufficient dynamic range to evaluate to %i dB',min(y_fit)) + end + + % calculate fit over yfit + x = reshape(IX,1,length(IX)); + y = reshape(E(IX),1,length(IX)); + p = polyfit(x,y,1); + fit = polyval(p,1:2*length(E)); % actual fit + fit2 = fit-max(fit); % fit anchored to 0dB + + diff_y = abs(diff(y_fit)); % dB range diff + t = (y_dec/diff_y)*find(fit2<=-diff_y,1,'first')/fs; % estimate decay time + + fit = fit(1:length(E)); + +end + diff --git a/egs/aspire/s5/local/multi_condition/rirs/octdsgn.m b/egs/aspire/s5/local/multi_condition/rirs/octdsgn.m new file mode 100644 index 00000000000..21a323a1bf5 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/rirs/octdsgn.m @@ -0,0 +1,45 @@ +function [B,A] = octdsgn(Fc,Fs,N); +% OCTDSGN Design of an octave filter. +% [B,A] = OCTDSGN(Fc,Fs,N) designs a digital octave filter with +% center frequency Fc for sampling frequency Fs. +% The filter are designed according to the Order-N specification +% of the ANSI S1.1-1986 standard. Default value for N is 3. +% Warning: for meaningful design results, center values used +% should preferably be in range Fs/200 < Fc < Fs/5. +% Usage of the filter: Y = FILTER(B,A,X). +% +% Requires the Signal Processing Toolbox. +% +% See also OCTSPEC, OCT3DSGN, OCT3SPEC. + +% Author: Christophe Couvreur, Faculte Polytechnique de Mons (Belgium) +% couvreur@thor.fpms.ac.be +% Last modification: Aug. 22, 1997, 9:00pm. + +% References: +% [1] ANSI S1.1-1986 (ASA 65-1986): Specifications for +% Octave-Band and Fractional-Octave-Band Analog and +% Digital Filters, 1993. + +if (nargin > 3) | (nargin < 2) + error('Invalide number of arguments.'); +end +if (nargin == 2) + N = 3; +end +if (Fc > 0.70*(Fs/2)) + error('Design not possible. Check frequencies.'); +end + +% Design Butterworth 2Nth-order octave filter +% Note: BUTTER is based on a bilinear transformation, as suggested in [1]. +%W1 = Fc/(Fs/2)*sqrt(1/2); +%W2 = Fc/(Fs/2)*sqrt(2); +pi = 3.14159265358979; +beta = pi/2/N/sin(pi/2/N); +alpha = (1+sqrt(1+8*beta^2))/4/beta; +W1 = Fc/(Fs/2)*sqrt(1/2)/alpha; +W2 = Fc/(Fs/2)*sqrt(2)*alpha; +[B,A] = butter(N,[W1,W2]); + + diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_sim_rirs.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_sim_rirs.sh new file mode 100755 index 00000000000..1158a770e4a --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_sim_rirs.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# Copyright 2015 Johns Hopkins University (author: Vijayaditya Peddinti) +# Apache 2.0 +# This script downloads the impulse responses and noise files from the +# Reverb2014 challenge +# and converts them to wav files with the required sampling rate +#============================================== + +download=true +sampling_rate=8000 +output_bit=16 +DBname=SIM_RIRS +RIR_total=75000 +file_splitter= #script to generate job scripts given the command file + +. cmd.sh +. path.sh +. ./utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --download true db/RIR_databases/ data/impulses_noises exp/make_reverb/log" + exit 1; +fi + +RIR_home=$1 +output_dir=$2 +log_dir=$3 + +cat >$log_dir/genrir.m < $log_dir/${DBname}_type${type_num}.rir.list +echo "Found $total_files simulated impulse responses in $output_dir." +for data_file in ${data_files[@]}; do + output_file_name=${DBname}_type${type_num}_`basename $data_file | tr '[:upper:]' '[:lower:]'` + mv $data_file ${output_dir}/${output_file_name} + echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list + files_done=$((files_done + 1)) +done + diff --git a/egs/aspire/s5/local/multi_condition/rirs/rir_generator.cpp b/egs/aspire/s5/local/multi_condition/rirs/rir_generator.cpp new file mode 100644 index 00000000000..c09a8947257 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/rirs/rir_generator.cpp @@ -0,0 +1,459 @@ +/* +Program : Room Impulse Response Generator + +Description : Computes the response of an acoustic source to one or more + microphones in a reverberant room using the image method [1,2]. + + [1] J.B. Allen and D.A. Berkley, + Image method for efficiently simulating small-room acoustics, + Journal Acoustic Society of America, 65(4), April 1979, p 943. + + [2] P.M. Peterson, + Simulating the response of multiple microphones to a single + acoustic source in a reverberant room, Journal Acoustic + Society of America, 80(5), November 1986. + +Author : dr.ir. E.A.P. Habets (ehabets@dereverberation.org) + +Version : 2.0.20100920 + +History : 1.0.20030606 Initial version + 1.1.20040803 + Microphone directivity + + Improved phase accuracy [2] + 1.2.20040312 + Reflection order + 1.3.20050930 + Reverberation Time + 1.4.20051114 + Supports multi-channels + 1.5.20051116 + High-pass filter [1] + + Microphone directivity control + 1.6.20060327 + Minor improvements + 1.7.20060531 + Minor improvements + 1.8.20080713 + Minor improvements + 1.9.20090822 + 3D microphone directivity control + 2.0.20100920 + Calculation of the source-image position + changed in the code and tutorial. + This ensures a proper response to reflections + in case a directional microphone is used. + +Copyright (C) 2003-2010 E.A.P. Habets, The Netherlands. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#define _USE_MATH_DEFINES + +#include "matrix.h" +#include "mex.h" +#include "math.h" + +#define ROUND(x) ((x)>=0?(long)((x)+0.5):(long)((x)-0.5)) + +#ifndef M_PI + #define M_PI 3.14159265358979323846 +#endif + +double sinc(double x) +{ + if (x == 0) + return(1.); + else + return(sin(x)/x); +} + +double sim_microphone(double x, double y, double z, double* angle, char mtype) +{ + if (mtype=='b' || mtype=='c' || mtype=='s' || mtype=='h') + { + double strength, vartheta, varphi, alpha; + + // Polar Pattern alpha + // --------------------------- + // Bidirectional 0 + // Hypercardioid 0.25 + // Cardioid 0.5 + // Subcardioid 0.75 + // Omnidirectional 1 + + switch(mtype) + { + case 'b': + alpha = 0; + break; + case 'h': + alpha = 0.25; + break; + case 'c': + alpha = 0.5; + break; + case 's': + alpha = 0.75; + break; + }; + + vartheta = acos(z/sqrt(pow(x,2)+pow(y,2)+pow(z,2))); + varphi = atan2(y,x); + + strength = sin(M_PI/2-angle[1]) * sin(vartheta) * cos(angle[0]-varphi) + cos(M_PI/2-angle[1]) * cos(vartheta); + strength = alpha + (1-alpha) * strength; + + return strength; + } + else + { + return 1; + } +} + +void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) +{ + if (nrhs == 0) + { + mexPrintf("--------------------------------------------------------------------\n" + "| Room Impulse Response Generator |\n" + "| |\n" + "| Computes the response of an acoustic source to one or more |\n" + "| microphones in a reverberant room using the image method [1,2]. |\n" + "| |\n" + "| Author : dr.ir. Emanuel Habets (ehabets@dereverberation.org) |\n" + "| |\n" + "| Version : 2.0.20100920 |\n" + "| |\n" + "| Copyright (C) 2003-2010 E.A.P. Habets, The Netherlands. |\n" + "| |\n" + "| [1] J.B. Allen and D.A. Berkley, |\n" + "| Image method for efficiently simulating small-room acoustics,|\n" + "| Journal Acoustic Society of America, |\n" + "| 65(4), April 1979, p 943. |\n" + "| |\n" + "| [2] P.M. Peterson, |\n" + "| Simulating the response of multiple microphones to a single |\n" + "| acoustic source in a reverberant room, Journal Acoustic |\n" + "| Society of America, 80(5), November 1986. |\n" + "--------------------------------------------------------------------\n\n" + "function [h, beta_hat] = rir_generator(c, fs, r, s, L, beta, nsample,\n" + " mtype, order, dim, orientation, hp_filter);\n\n" + "Input parameters:\n" + " c : sound velocity in m/s.\n" + " fs : sampling frequency in Hz.\n" + " r : M x 3 array specifying the (x,y,z) coordinates of the\n" + " receiver(s) in m.\n" + " s : 1 x 3 vector specifying the (x,y,z) coordinates of the\n" + " source in m.\n" + " L : 1 x 3 vector specifying the room dimensions (x,y,z) in m.\n" + " beta : 1 x 6 vector specifying the reflection coefficients\n" + " [beta_x1 beta_x2 beta_y1 beta_y2 beta_z1 beta_z2] or\n" + " beta = Reverberation Time (T_60) in seconds.\n" + " nsample : number of samples to calculate, default is T_60*fs.\n" + " mtype : [omnidirectional, subcardioid, cardioid, hypercardioid,\n" + " bidirectional], default is omnidirectional.\n" + " order : reflection order, default is -1, i.e. maximum order.\n" + " dim : room dimension (2 or 3), default is 3.\n" + " orientation : direction in which the microphones are pointed, specified using\n" + " azimuth and elevation angles (in radians), default is [0 0].\n" + " hp_filter : use 'false' to disable high-pass filter, the high-pass filter\n" + " is enabled by default.\n\n" + "Output parameters:\n" + " h : M x nsample matrix containing the calculated room impulse\n" + " response(s).\n" + " beta_hat : In case a reverberation time is specified as an input parameter\n" + " the corresponding reflection coefficient is returned.\n\n"); + return; + } +// else +// { +// mexPrintf("Room Impulse Response Generator (Version 2.0.20100920) by Emanuel Habets\n" +// "Copyright (C) 2003-2010 E.A.P. Habets, The Netherlands.\n"); +// } + + // Check for proper number of arguments + if (nrhs < 6) + mexErrMsgTxt("Error: There are at least six input parameters required."); + if (nrhs > 12) + mexErrMsgTxt("Error: Too many input arguments."); + if (nlhs > 2) + mexErrMsgTxt("Error: Too many output arguments."); + + // Check for proper arguments + if (!(mxGetN(prhs[0])==1) || !mxIsDouble(prhs[0]) || mxIsComplex(prhs[0])) + mexErrMsgTxt("Invalid input arguments!"); + if (!(mxGetN(prhs[1])==1) || !mxIsDouble(prhs[1]) || mxIsComplex(prhs[1])) + mexErrMsgTxt("Invalid input arguments!"); + if (!(mxGetN(prhs[2])==3) || !mxIsDouble(prhs[2]) || mxIsComplex(prhs[2])) + mexErrMsgTxt("Invalid input arguments!"); + if (!(mxGetN(prhs[3])==3) || !mxIsDouble(prhs[3]) || mxIsComplex(prhs[3])) + mexErrMsgTxt("Invalid input arguments!"); + if (!(mxGetN(prhs[4])==3) || !mxIsDouble(prhs[4]) || mxIsComplex(prhs[4])) + mexErrMsgTxt("Invalid input arguments!"); + if (!(mxGetN(prhs[5])==6 || mxGetN(prhs[5])==1) || !mxIsDouble(prhs[5]) || mxIsComplex(prhs[5])) + mexErrMsgTxt("Invalid input arguments!"); + + // Load parameters + double c = mxGetScalar(prhs[0]); + double fs = mxGetScalar(prhs[1]); + const double* rr = mxGetPr(prhs[2]); + int nr_of_mics = (int) mxGetM(prhs[2]); + const double* ss = mxGetPr(prhs[3]); + const double* LL = mxGetPr(prhs[4]); + const double* beta_ptr = mxGetPr(prhs[5]); + double* beta = new double[6]; + int nsamples; + char* mtype; + int order; + int dim; + double angle[2]; + int hp_filter; + double TR; + + plhs[1] = mxCreateDoubleMatrix(1, 1, mxREAL); + double* beta_hat = mxGetPr(plhs[1]); + beta_hat[0] = 0; + + // Reflection coefficients or Reverberation Time? + if (mxGetN(prhs[5])==1) + { + double V = LL[0]*LL[1]*LL[2]; + double S = 2*(LL[0]*LL[2]+LL[1]*LL[2]+LL[0]*LL[1]); + TR = beta_ptr[0]; + double alfa = 24*V*log(10.0)/(c*S*TR); + if (alfa > 1) + mexErrMsgTxt("Error: The reflection coefficients cannot be calculated using the current " + "room parameters, i.e. room size and reverberation time.\n Please " + "specify the reflection coefficients or change the room parameters."); + beta_hat[0] = sqrt(1-alfa); + for (int i=0;i<6;i++) + beta[i] = beta_hat[0]; + } + else + { + for (int i=0;i<6;i++) + beta[i] = beta_ptr[i]; + } + + // High-pass filter (optional) + if (nrhs > 11 && mxIsEmpty(prhs[11]) == false) + { + hp_filter = (int) mxGetScalar(prhs[11]); + } + else + { + hp_filter = 1; + } + + // 3D Microphone orientation (optional) + if (nrhs > 10 && mxIsEmpty(prhs[10]) == false) + { + const double* orientation = mxGetPr(prhs[10]); + if (mxGetN(prhs[10]) == 1) + { + angle[0] = orientation[0]; + angle[1] = 0; + } + else + { + angle[0] = orientation[0]; + angle[1] = orientation[1]; + } + } + else + { + angle[0] = 0; + angle[1] = 0; + } + + // Room Dimension (optional) + if (nrhs > 9 && mxIsEmpty(prhs[9]) == false) + { + dim = (int) mxGetScalar(prhs[9]); + if (dim != 2 && dim != 3) + mexErrMsgTxt("Invalid input arguments!"); + + if (dim == 2) + { + beta[4] = 0; + beta[5] = 0; + } + } + else + { + dim = 3; + } + + // Reflection order (optional) + if (nrhs > 8 && mxIsEmpty(prhs[8]) == false) + { + order = (int) mxGetScalar(prhs[8]); + if (order < -1) + mexErrMsgTxt("Invalid input arguments!"); + } + else + { + order = -1; + } + + // Type of microphone (optional) + if (nrhs > 7 && mxIsEmpty(prhs[7]) == false) + { + mtype = new char[mxGetN(prhs[7])+1]; + mxGetString(prhs[7], mtype, mxGetN(prhs[7])+1); + } + else + { + mtype = new char[1]; + mtype[0] = 'o'; + } + + // Number of samples (optional) + if (nrhs > 6 && mxIsEmpty(prhs[6]) == false) + { + nsamples = (int) mxGetScalar(prhs[6]); + } + else + { + if (mxGetN(prhs[5])>1) + { + double V = LL[0]*LL[1]*LL[2]; + double S = 2*(LL[0]*LL[2]+LL[1]*LL[2]+LL[0]*LL[1]); + double alpha = ((1-pow(beta[0],2))+(1-pow(beta[1],2)))*LL[0]*LL[2] + + ((1-pow(beta[2],2))+(1-pow(beta[3],2)))*LL[1]*LL[2] + + ((1-pow(beta[4],2))+(1-pow(beta[5],2)))*LL[0]*LL[1]; + TR = 24*log(10.0)*V/(c*alpha); + if (TR < 0.128) + TR = 0.128; + } + nsamples = (int) (TR * fs); + } + + // Create output vector + plhs[0] = mxCreateDoubleMatrix(nr_of_mics, nsamples, mxREAL); + double* imp = mxGetPr(plhs[0]); + + // Temporary variables and constants (high-pass filter) + const double W = 2*M_PI*100/fs; + const double R1 = exp(-W); + const double B1 = 2*R1*cos(W); + const double B2 = -R1 * R1; + const double A1 = -(1+R1); + double X0; + double* Y = new double[3]; + + // Temporary variables and constants (image-method) + const double Fc = 1; + const int Tw = 2 * ROUND(0.004*fs); + const double cTs = c/fs; + double* hanning_window = new double[Tw+1]; + double* LPI = new double[Tw+1]; + double* r = new double[3]; + double* s = new double[3]; + double* L = new double[3]; + double hu[6]; + double refl[3]; + double dist; + double ll; + double strength; + int pos, fdist; + int n1,n2,n3; + int q, j, k; + int mx, my, mz; + int n; + + s[0] = ss[0]/cTs; s[1] = ss[1]/cTs; s[2] = ss[2]/cTs; + L[0] = LL[0]/cTs; L[1] = LL[1]/cTs; L[2] = LL[2]/cTs; + + // Hanning window + for (n = 0 ; n < Tw+1 ; n++) + { + hanning_window[n] = 0.5 * (1 + cos(2*M_PI*(n+Tw/2)/Tw)); + } + + for (int mic_nr = 0; mic_nr < nr_of_mics ; mic_nr++) + { + // [x_1 x_2 ... x_N y_1 y_2 ... y_N z_1 z_2 ... z_N] + r[0] = rr[mic_nr + 0*nr_of_mics] / cTs; + r[1] = rr[mic_nr + 1*nr_of_mics] / cTs; + r[2] = rr[mic_nr + 2*nr_of_mics] / cTs; + + n1 = (int) ceil(nsamples/(2*L[0])); + n2 = (int) ceil(nsamples/(2*L[1])); + n3 = (int) ceil(nsamples/(2*L[2])); + + // Generate room impulse response + for (mx = -n1 ; mx <= n1 ; mx++) + { + hu[0] = 2*mx*L[0]; + + for (my = -n2 ; my <= n2 ; my++) + { + hu[1] = 2*my*L[1]; + + for (mz = -n3 ; mz <= n3 ; mz++) + { + hu[2] = 2*mz*L[2]; + + for (q = 0 ; q <= 1 ; q++) + { + hu[3] = (1-2*q)*s[0] - r[0] + hu[0]; + refl[0] = pow(beta[0], abs(mx-q)) * pow(beta[1], abs(mx)); + + for (j = 0 ; j <= 1 ; j++) + { + hu[4] = (1-2*j)*s[1] - r[1] + hu[1]; + refl[1] = pow(beta[2], abs(my-j)) * pow(beta[3], abs(my)); + + for (k = 0 ; k <= 1 ; k++) + { + hu[5] = (1-2*k)*s[2] - r[2] + hu[2]; + refl[2] = pow(beta[4],abs(mz-k)) * pow(beta[5], abs(mz)); + + dist = sqrt(pow(hu[3], 2) + pow(hu[4], 2) + pow(hu[5], 2)); + + if (abs(2*mx-q)+abs(2*my-j)+abs(2*mz-k) <= order || order == -1) + { + fdist = (int) floor(dist); + if (fdist < nsamples) + { + strength = sim_microphone(hu[3], hu[4], hu[5], angle, mtype[0]) + * refl[0]*refl[1]*refl[2]/(4*M_PI*dist*cTs); + + for (n = 0 ; n < Tw+1 ; n++) + LPI[n] = hanning_window[n] * Fc * sinc(M_PI*Fc*(n-(dist-fdist)-(Tw/2))); + + pos = fdist-(Tw/2); + for (n = 0 ; n < Tw+1; n++) + if (pos+n >= 0 && pos+n < nsamples) + imp[mic_nr + nr_of_mics*(pos+n)] += strength * LPI[n]; + } + } + } + } + } + } + } + } + + // 'Original' high-pass filter as proposed by Allen and Berkley. + if (hp_filter == 1) + { + for (int idx = 0 ; idx < 3 ; idx++) {Y[idx] = 0;} + for (int idx = 0 ; idx < nsamples ; idx++) + { + X0 = imp[mic_nr+nr_of_mics*idx]; + Y[2] = Y[1]; + Y[1] = Y[0]; + Y[0] = B1*Y[1] + B2*Y[2] + X0; + imp[mic_nr+nr_of_mics*idx] = Y[0] + A1*Y[1] + R1*Y[2]; + } + } + } +} + diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh index 043cf973a47..ee5e29ab50a 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh @@ -32,7 +32,6 @@ if [ $stage -le 1 ]; then data/impulses_noises || exit 1; # corrupt the fisher data to generate multi-condition data - # for data_dir in train dev test; do for data_dir in train dev test; do if [ "$data_dir" == "train" ]; then num_reps=$num_data_reps diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common_clean.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common_clean.sh new file mode 100755 index 00000000000..a91e436697c --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common_clean.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#set -e +# this script is based on local/online/run_nnet2_comman.sh +# but it operates on corrupted training/dev/test data sets + +. cmd.sh + +stage=2 +snrs="20:10:15:5:0" +num_data_reps=3 +ali_dir=exp/ +db_string="'air' 'rwcp' 'rvb2014'" # RIR dbs to be used in the experiment + # only dbs used for ASpIRE submission system have been used here +RIR_home=db/RIR_databases/ # parent directory of the RIR databases files +download_rirs=false # download the RIR databases from the urls or assume they are present in the RIR_home directory + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +# check if the required tools are present +local/multi_condition/check_version.sh || exit 1; + +mkdir -p exp/nnet2_multicondition + +if [ $stage -le 2 ]; then + mfccdir=mfcc + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english_reverb-$date/s5/$mfccdir/storage $mfccdir/storage + fi + for data_dir in train ; do + utils/copy_data_dir.sh data/$data_dir data/${data_dir}_hires + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${data_dir}_hires \ + exp/make_reverb_hires/${data_dir} $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${data_dir}_hires exp/make_reverb_hires/${data_dir} $mfccdir || exit 1; + utils/fix_data_dir.sh data/${data_dir}_hires + utils/validate_data_dir.sh data/${data_dir}_hires + done + + awk '{printf "%s\n", $1}' data/train_100k/utt2spk > uttlist + utils/subset_data_dir.sh --utt-list uttlist \ + data/train_hires data/train_hires_100k + rm uttlist + utils/subset_data_dir.sh data/train_hires 30000 data/train_hires_30k +fi + +if [ $stage -le 3 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/train_hires_100k data/lang exp/tri4a exp/nnet2_multicondition/tri5a_clean +fi + + +if [ $stage -le 4 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest + # subset. the input directory exp/nnet2_online/tri5a is only needed for + # the splice-opts and the LDA transform. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \ + data/train_hires_30k 512 exp/nnet2_multicondition/tri5a_clean \ + exp/nnet2_multicondition/diag_ubm_clean +fi + +if [ $stage -le 5 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 100k subset (about one sixteenth of the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_hires_100k exp/nnet2_multicondition/diag_ubm_clean \ + exp/nnet2_multicondition/extractor_clean || exit 1; +fi + +if [ $stage -le 6 ]; then + ivectordir=exp/nnet2_multicondition/ivectors_train_clean + if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems. + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english_reverb/s5/$ivectordir/storage $ivectordir/storage + fi + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + data/train_hires data/train_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ + data/train_hires_max2 exp/nnet2_multicondition/extractor_clean $ivectordir || exit 1; +fi diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common_sim.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common_sim.sh new file mode 100755 index 00000000000..15fc171ca52 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common_sim.sh @@ -0,0 +1,141 @@ +#!/bin/bash +#set -e +# this script is based on local/online/run_nnet2_comman.sh +# but it operates on corrupted training/dev/test data sets + +. cmd.sh + +stage=0 +snrs="20:10:15:5:0" +num_data_reps=1 +ali_dir=exp/ +db_string="'sim_rirs'" # RIR dbs to be used in the experiment + # only dbs used for ASpIRE submission system have been used here +RIR_home=db/RIR_databases/ # parent directory of the RIR databases files +download_rirs=false # download the RIR databases from the urls or assume they are present in the RIR_home directory + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +# check if the required tools are present +local/multi_condition/check_version.sh || exit 1; + +mkdir -p exp/nnet2_multicondition +if [ $stage -le 0 ]; then + # prepare the impulse responses + local/multi_condition/prepare_impulses_noises.sh --log-dir exp/make_reverb_sim/log \ + --db-string "$db_string" \ + --download-rirs $download_rirs \ + --RIR-home $RIR_home \ + data/impulses_noises_sim || exit 1; +exit +fi + + +if [ $stage -le 1 ]; then + # corrupt the fisher data to generate multi-condition data + # for data_dir in train dev test; do + for data_dir in train dev test; do + if [ "$data_dir" == "train" ]; then + num_reps=$num_data_reps + else + num_reps=1 + fi + reverb_data_dirs= + for i in `seq 1 $num_reps`; do + cur_dest_dir=" data/temp_${data_dir}_${i}" + local/multi_condition/reverberate_data_dir.sh --random-seed $i --log-dir exp/make_reverb/log \ + --snrs "$snrs" --log-dir exp/make_corrupted_wav \ + data/${data_dir} data/impulses_noises_sim $cur_dest_dir + reverb_data_dirs+=" $cur_dest_dir" + done + utils/combine_data.sh --extra-files utt2uniq data/${data_dir}_rvbsim $reverb_data_dirs + rm -rf $reverb_data_dirs + done + + # create the dev, test and eval sets from the aspire recipe +# local/multi_condition/aspire_data_prep.sh + + # copy the alignments for the newly created utterance ids + ali_dirs= + for i in `seq 1 $num_data_reps`; do + local/multi_condition/copy_ali_dir.sh --utt-prefix "rev${i}_" exp/tri5a exp/tri5a_temp_$i || exit 1; + ali_dirs+=" exp/tri5a_temp_$i" + done + local/multi_condition/combine_ali_dirs.sh --ref-data-dir data/train_rvbsim \ + exp/tri5a_rvbsim_ali $ali_dirs || exit 1; + + # copy the alignments for training the 100k system (from tri4a) + local/multi_condition/copy_ali_dir.sh --utt-prefix "rev1_" exp/tri4a exp/tri4a_rvbsim || exit 1; +fi + +if [ $stage -le 2 ]; then + mfccdir=mfcc_reverb + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english_reverb-$date/s5/$mfccdir/storage $mfccdir/storage + fi + for data_dir in train_rvbsim dev_rvbsim test_rvbsim ; do + utils/copy_data_dir.sh data/$data_dir data/${data_dir}_hires + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${data_dir}_hires \ + exp/make_reverb_hires/${data_dir} $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${data_dir}_hires exp/make_reverb_hires/${data_dir} $mfccdir || exit 1; + utils/fix_data_dir.sh data/${data_dir}_hires + utils/validate_data_dir.sh data/${data_dir}_hires + done + + # want the 100k subset to exactly match train_100k, since we'll use its alignments. + awk -v p='rev1_' '{printf "%s%s\n", p, $1}' data/train_100k/utt2spk > uttlist + #while read line; do grep $line data/train_rvbsim_hires/utt2spk|head -1; done < uttlist |awk '{print $1}' > uttlist2 + #mv uttlist2 uttlist + utils/subset_data_dir.sh --utt-list uttlist \ + data/train_rvbsim_hires data/train_rvbsim_hires_100k + rm uttlist + utils/subset_data_dir.sh data/train_rvbsim_hires 30000 data/train_rvbsim_hires_30k +fi + +if [ $stage -le 3 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/train_rvbsim_hires_100k data/lang exp/tri4a_rvbsim exp/nnet2_multicondition/tri5a_sim +fi + + +if [ $stage -le 4 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest + # subset. the input directory exp/nnet2_online/tri5a is only needed for + # the splice-opts and the LDA transform. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \ + data/train_rvbsim_hires_30k 512 exp/nnet2_multicondition/tri5a_sim \ + exp/nnet2_multicondition/diag_ubm_sim +fi + +if [ $stage -le 5 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 100k subset (about one sixteenth of the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_rvbsim_hires_100k exp/nnet2_multicondition/diag_ubm_sim \ + exp/nnet2_multicondition/extractor_sim || exit 1; +fi + +if [ $stage -le 6 ]; then + ivectordir=exp/nnet2_multicondition/ivectors_train_sim + if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems. + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english_reverb/s5/$ivectordir/storage $ivectordir/storage + fi + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). +# steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ +# data/train_rvbsim_hires data/train_rvbsim_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ + data/train_rvbsim_hires_max2 exp/nnet2_multicondition/extractor_sim $ivectordir || exit 1; +fi diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh index 8119ff44661..4be5efe25ec 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh @@ -13,7 +13,6 @@ stage=1 train_stage=-10 use_gpu=true dir=exp/nnet2_multicondition/nnet_ms_a -dest_wav_dir=data/rvb_wavs # directory to store the reverberated wav files set -e . cmd.sh @@ -52,7 +51,7 @@ else fi # do the common parts of the script. -local/multi_condition/run_nnet2_common.sh --dest-wav-dir $dest_wav_dir --stage $stage +local/multi_condition/run_nnet2_common.sh --stage $stage if [ $stage -le 7 ]; then diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_clean.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_clean.sh new file mode 100755 index 00000000000..6a6678f8542 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_clean.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# This is the "multi-splice" version of the online-nnet2 training script. +# It's currently the best recipe for aspire. +# You'll notice that we splice over successively larger windows as we go deeper +# into the network. The temporal context used for training on reverberant data +# is larger than that used for other LVCSR recipes. + +. cmd.sh + + +stage=7 +train_stage=-10 +use_gpu=true +dir=exp/nnet2_multicondition/nnet_ms_clean + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if $use_gpu; then + if ! cuda-compiled; then + cat <