Skip to content

Instantly share code, notes, and snippets.

@quanhua92
Created September 8, 2019 02:46
Show Gist options
  • Save quanhua92/23dbd1c17da48cdc3ae12d626bf1abc3 to your computer and use it in GitHub Desktop.
Save quanhua92/23dbd1c17da48cdc3ae12d626bf1abc3 to your computer and use it in GitHub Desktop.
K14513_CD_Files\MATLAB_code\objective_measures\quality
function addnoise_asl(cleanfile, noisefile, outfile, snr)
% ----------------------------------------------------------------------
% This function adds noise to a file at a specified SNR level. It uses
% the active speech level to compute the speech energy. The
% active speech level1 is computed as per ITU-T P.56 standard [1].
%
% Usage: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR)
%
% cleanFile.wav - clean input file in .wav format
% noiseFile.wav - file containing the noise signal in .wav format
% noisyFile.wav - resulting noisy file
% SNR - desired SNR in dB
%
% Note that if the variable IRS below (line 38) is set to 1, then it applies the IRS
% filter to bandlimit the signal to 300 Hz - 3.2 kHz. The default IRS
% value is 0, ie, no IRS filtering is applied.
%
% Example call:
% addnoise_asl('sp04.wav','white_noise.wav','sp04_white_5db.wav',5);
%
%
% References:
% [1] ITU-T (1993). Objective measurement of active speech level. ITU-T
% Recommendation P. 56
%
% Author: Yi Hu and Philipos C. Loizou
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin ~=4
fprintf('USAGE: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) \n');
fprintf('For more help, type: help addnoise_asl\n\n');
return;
end
IRS=0; % if 1 apply IRS filter simulating telephone handset bandwidth (300 Hz -3.2 kHz)
% wavread gives floating point column data
[clean, srate, nbits]= wavread(cleanfile);
% filter clean speech with irs filter
if IRS==1, clean= apply_IRS( clean, srate, nbits); end;
[Px, asl, c0]= asl_P56 ( clean, srate, nbits);
% Px is the active speech level ms energy, asl is the active factor, and c0
% is the active speech level threshold.
x=clean;
x_len= length( x); % length of speech signal
[noise, srate1, nbits1]= wavread( noisefile);
if (srate1~= srate)| (nbits1~= nbits)
error( 'the formats of the two files dont match!');
end
noise_len= length( noise);
if (noise_len<= x_len)
error( 'the noise length has to be greater than speech length!');
end
rand_start_limit= noise_len- x_len+ 1;
% the start of the noise segment can vary between [1 rand_start_limit]
rand_start= round( (rand_start_limit- 1)* rand( 1)+ 1);
% random start of the noise segment
noise_segment= noise( rand_start: rand_start+ x_len- 1);
if IRS==1, noise_segment= apply_IRS( noise_segment, srate, nbits); end;
% this is the randomly selected noise segment that will be added to the
% clean speech x
Pn= noise_segment'* noise_segment/ x_len;
% we need to scale the noise segment samples to obtain the desired snr= 10*
% log10( Px/ (sf^2 * Pn))
sf= sqrt( double(Px/Pn/ (10^ (snr/ 10)))); % scale factor for noise segment data
noise_segment= noise_segment * sf;
noisy = x+ noise_segment;
if ( (max( noisy)>= 1) | (min( noisy)< -1))
error( 'Overflow occurred!\n');
end;
wavwrite( noisy, srate, nbits, outfile);
fprintf( 1, '\n NOTE: For comparison, the SNR based on long-term RMS level is %4.2f dB.\n\n', 10*log10((x'*x)/ ...
(noise_segment'*noise_segment)));
%------------------------------------------------------------------------
function data_filtered= apply_IRS( data, Fs, nbits);
n= length( data);
% now find the next power of 2 which is greater or equal to n
pow_of_2= 2^ (ceil( log2( n)));
align_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];
[number_of_points, trivial]= size( align_filter_dB);
overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ...
1000);
x= zeros( 1, pow_of_2);
x( 1: n)= data;
x_fft= fft( x, pow_of_2);
freq_resolution= Fs/ pow_of_2;
factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ...
align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ...
overallGainFilter;
factor= 10.^ (factorDb/ 20);
factor= [factor, fliplr( factor( 2: pow_of_2/2))];
x_fft= x_fft.* factor;
y= ifft( x_fft, pow_of_2);
data_filtered= y( 1: n)';
function [asl_ms, asl, c0]= asl_P56 ( x, fs, nbits)
% this implements ITU P.56 method B.
% 'speechfile' is the speech file to calculate active speech level for,
% 'asl' is the active speech level (between 0 and 1),
% 'asl_rms' is the active speech level mean square energy.
% x is the column vector of floating point speech data
x= x(:); % make sure x is column vector
T= 0.03; % time constant of smoothing, in seconds
H= 0.2; % hangover time in seconds
M= 15.9;
% margin in dB of the difference between threshold and active speech level
thres_no= nbits- 1; % number of thresholds, for 16 bit, it's 15
I= ceil( fs* H); % hangover in samples
g= exp( -1/( fs* T)); % smoothing factor in envelop detection
c( 1: thres_no)= 2.^ (-15: thres_no- 16);
% vector with thresholds from one quantizing level up to half the maximum
% code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5;
a( 1: thres_no)= 0; % activity counter for each level threshold
hang( 1: thres_no)= I; % hangover counter for each level threshold
sq= x'* x; % long-term level square energy of x
x_len= length( x); % length of x
% use a 2nd order IIR filter to detect the envelope q
x_abs= abs( x);
p= filter( 1-g, [1 -g], x_abs);
q= filter( 1-g, [1 -g], p);
for k= 1: x_len
for j= 1: thres_no
if (q(k)>= c(j))
a(j)= a(j)+ 1;
hang(j)= 0;
elseif (hang(j)< I)
a(j)= a(j)+ 1;
hang(j)= hang(j)+ 1;
else
break;
end
end
end
asl= 0;
asl_rms= 0;
if (a(1)== 0)
return;
else
AdB1= 10* log10( sq/ a(1)+ eps);
end
CdB1= 20* log10( c(1)+ eps);
if (AdB1- CdB1< M)
return;
end
AdB(1)= AdB1;
CdB(1)= CdB1;
Delta(1)= AdB1- CdB1;
for j= 2: thres_no
AdB(j)= 10* log10( sq/ (a(j)+ eps)+ eps);
CdB(j)= 20* log10( c(j)+ eps);
end
for j= 2: thres_no
if (a(j) ~= 0)
Delta(j)= AdB(j)- CdB(j);
if (Delta(j)<= M)
% interpolate to find the asl
[asl_ms_log, cl0]= bin_interp( AdB(j), ...
AdB(j-1), CdB(j), CdB(j-1), M, 0.5);
asl_ms= 10^ (asl_ms_log/ 10);
asl= (sq/ x_len)/ asl_ms;
c0= 10^( cl0/ 20);
break;
end
end
end
function [asl_ms_log, cc]= bin_interp(upcount, lwcount, ...
upthr, lwthr, Margin, tol)
if (tol < 0)
tol = -tol;
end
% Check if extreme counts are not already the true active value
iterno = 1;
if (abs(upcount - upthr - Margin) < tol)
asl_ms_log= upcount;
cc= upthr;
return;
end
if (abs(lwcount - lwthr - Margin) < tol)
asl_ms_log= lwcount;
cc= lwthr;
return;
end
% Initialize first middle for given (initial) bounds
midcount = (upcount + lwcount) / 2.0;
midthr = (upthr + lwthr) / 2.0;
% Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol)
while ( 1)
diff= midcount- midthr- Margin;
if (abs(diff)<= tol)
break;
end
% if tolerance is not met up to 20 iteractions, then relax the
% tolerance by 10%
iterno= iterno+ 1;
if (iterno>20)
tol = tol* 1.1;
end
if (diff> tol) % then new bounds are ...
midcount = (upcount + midcount) / 2.0;
% upper and middle activities
midthr = (upthr + midthr) / 2.0;
% ... and thresholds
elseif (diff< -tol) % then new bounds are ...
midcount = (midcount + lwcount) / 2.0;
% middle and lower activities
midthr = (midthr + lwthr) / 2.0;
% ... and thresholds
end
end
% Since the tolerance has been satisfied, midcount is selected
% as the interpolated value with a tol [dB] tolerance.
asl_ms_log= midcount;
cc= midthr;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment