@naoh16
Created February 3, 2012 16:57
Calculate SNR (Signal-to-Noise Ratio) based on an assumption of two-mixture Gaussian distribution
/**
* emsnr
* Calculate the SNR.
*
* Usage: echo datafilename | emsnr -r residual -l maxloop
* Output: filename num_frames SNR SNR_fix SNR_ptile weight low_power high_power low_variance high_variance loglikelihood
*
* 2012-02-18 Added license
* 2008-10-12 Changed all arguments to options (-h, -s, -x, -r, -l)
* Added pre-emphasis option (-e)
* 2008-10-10 Fixed E-step and M-step being mixed together
* HTK-like specification of initial values
* Output the 80th-percentile minus 20th-percentile of the data as SNR_ptile
* 2007-08-02 Also output the SNR obtained by Dat's method
* 2006-12-24 Changed the initial means (to the 10th- and 90th-percentile values)
* 2006-12-24 Changed the initial means (from max/min to the 90% values)
* Improved precision (double -> long double)
* 2004-10-31 Changed to terminate when the likelihood becomes NaN
* 2004-10-12 Set a lower bound on the variance
* 2004-09-24 Fixed a memory leak and an unclosed file
* 2004-05-13
*
* Bibliography
* [1] T. H. Dat, K. Takeda, F. Itakura,
* "On-line Gaussian mixture modeling in the log-power
* domain for signal-to-noise ratio estimation and speech enhancement,"
* Speech Communication, vol. 48, no. 11, pp. 1515-1527, 2006.
* [2] T. H. Dat, K. Takeda, F. Itakura,
* "MULTICHANNEL SPEECH ENHANCEMENT BASED ON SPEECH SPECTRAL
* MAGNITUDE ESTIMATION USING GENERALIZED GAMMA PRIOR DISTRIBUTION,"
* in Proceedings of ICASSP, 2006.
*
* Last Modified: 2012/02/18 16:31:07.
*/
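/*
 * Example build and run (a minimal sketch; the file name "emsnr.c" and the
 * gcc invocation are assumptions, any C compiler linked against libm should do):
 *
 *   gcc -O2 -o emsnr emsnr.c -lm
 *   echo test.raw | ./emsnr -r 1.0E-8 -l 1000
 *
 * Input files are 16-bit linear PCM, either headerless (raw) or RIFF/WAV
 * (the RIFF header is detected and skipped).
 */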
/*************** LICENSE *****************************************
"emsnr" is licensed under the modified BSD license (3-clause license).
Copyright (c) 2011-2012 Sunao Hara <hara@is.naist.jp>, NAIST/JAPAN.
Copyright (c) 2004-2011 Sunao Hara <naoh@nagoya-u.jp>, Takeda Laboratory, Nagoya University.
Copyright (c) 2002-2004 Hiroshi Fujimura, Takeda Laboratory, Nagoya University.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the author, the organization, nor the contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h> /* strcmp, strrchr, memset */
#ifdef _MSC_VER
# include <search.h>
# include <float.h>
# define isnan(x) _isnan(x)
#endif
/**
* INITIALIZE_MODE
* 1 ... means    = values at 10% and 90% of the range between the data minimum and maximum
*       variance = 1.0
* 2 ... Fujimura original
*       means    = 10th and 90th percentiles of the data
*       variance = 10.0
* 3 ... HTK-like
*       means    = sample mean +/- 0.2 * sample standard deviation
*       variance = 0.8 * sample variance
*/
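/* Note: INITIALIZE_MODE is a compile-time switch; to try another mode,
 * edit the #define below and rebuild (there is no run-time option). */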
#define INITIALIZE_MODE 3
#define TPI (double)6.283185307179586476925286766559
#define ZERO_POWER -100.0
//#define VAR_FLOOR 1.0E-3
//#define LAMBDA_FLOOR 1.0E-3
#define VAR_FLOOR 1.0
#define VAR_CEIL 500.0
#define LAMBDA_FLOOR 0.2
// at 16000 [sample/sec]
// 1 [ms] = 16 [sample]
// 2 [ms] = 32 [sample]
// 4 [ms] = 64 [sample]
// at 8000 [sample/sec]
// 1 [ms] = 8 [sample]
// 2 [ms] = 16 [sample]
// 4 [ms] = 32 [sample]
#define FRAME_SHIFT 8
#define WINDOW_SIZE 32
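// With the values above, each analysis frame is WINDOW_SIZE = 32 samples and
// advances by FRAME_SHIFT = 8 samples, i.e. a 4 ms window with a 1 ms shift
// at 8000 sample/sec (2 ms / 0.5 ms at 16000 sample/sec).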
// defined in stdio.h
//#define FILENAME_MAX 1024
double g_hamming_table[WINDOW_SIZE];
typedef struct {
double *data;
long len;
double mu1;
double mu2;
double sigma1;
double sigma2;
double lambda;
} GAUSS_PROB_PARAM;
int my_compare_double(const void *a, const void *b)
{
const double* da = (const double *)a;
const double* db = (const double *)b;
// Return 0 for equal values so qsort() sees a consistent ordering.
return (*da > *db) - (*da < *db);
}
/**
* Read 16-bit (short) samples and return them as an array of double.
*/
double* i16d_readdata(char *filename, long *len, int bSwap) {
short *tmp_data;
double *data, *p;
long i, _len;
// char buf[256];
FILE *fp;
fp = fopen(filename, "rb");
if(!fp){
fprintf(stderr, "Error at fopen(i16d_readdata).\n");
return NULL;
}
// Calculate length of the file
{
fseek(fp, 0L, SEEK_END);
_len=ftell(fp) / sizeof(short);
*len = _len;
fseek(fp, 0L, SEEK_SET);
}
// Skip RIFF header chunks
if(_len>4) {
unsigned char buf[5];
unsigned int chanklen; /* RIFF chunk lengths are 32-bit */
fread(buf, sizeof(char), 4, fp); // "RIFF"
buf[4] = '\0';
if(buf[0]=='R' && buf[1]=='I' && buf[2]=='F' && buf[3]=='F') {
fprintf(stderr, "Caution: RIFF File...");
fread(buf, sizeof(char), 4, fp); // "RIFF" length
fread(buf, sizeof(char), 4, fp); // "WAVE"
fread(buf, sizeof(char), 4, fp); // "fmt "
while(!(buf[0]=='d' && buf[1]=='a' && buf[2]=='t' && buf[3]=='a')) {
// printf("%s", buf);
fread(&chanklen, sizeof(chanklen), 1, fp); // "????" length
fseek(fp, chanklen, SEEK_CUR);
fread(buf, sizeof(char), 4, fp); // "????"
}
// printf("%s", buf);
fread(&chanklen, sizeof(chanklen), 1, fp); // "????" length
// printf("[%d->%d]\n", _len, chanklen/sizeof(short));
_len = chanklen / sizeof(short);
*len = _len;
fprintf(stderr, "header was removed.\n");
} else {
// return to start position, if the file format is not RIFF.
fseek(fp, 0L, SEEK_SET);
}
}
/*
* allocate the workspace memories
*/
tmp_data = malloc(_len * sizeof(short));
if(tmp_data == NULL){
fprintf(stderr, "Error at malloc_tmpdata(i16d_readdata).\n");
return NULL;
}
data = malloc(_len * sizeof(double));
if(data == NULL){
fprintf(stderr, "Error at malloc_data(i16d_readdata).\n");
free(tmp_data);
return NULL;
}
// load.
fread(tmp_data, sizeof(short), _len, fp);
if(bSwap) {
unsigned char *p, tmp;
p = (unsigned char *)tmp_data;
// swap
for(i=0; i<_len; i++){
tmp = p[2*i];
p[2*i] = p[2*i+1];
p[2*i+1] = tmp;
}
}
// Copy (assign) into the double array.
// Add a tiny amount of noise to guard against all-zero data.
{
double data_avg = 0.0;
for(i=0; i<_len; i++){
data[i] = ((double)tmp_data[i]+0.1*(rand()%2)) / 32768.0;
data_avg += data[i];
}
data_avg = data_avg / (double)_len;
// mean normalization.
for(i=0; i<_len; i++){
data[i] -= data_avg;
}
}
free(tmp_data); tmp_data = NULL;
fclose(fp);
return data;
}
void init_hamming() {
int n;
double len_1 = (double)(WINDOW_SIZE - 1);
for(n=0; n<WINDOW_SIZE; n++) {
g_hamming_table[n] = (0.54 - 0.46 * cos(TPI*(double)n / len_1));
}
}
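/**
 * In-place pre-emphasis: srcdst[s] = srcdst[s] - 0.97 * srcdst[s-1].
 * The loop runs backwards so srcdst[s-1] is still the original (unfiltered)
 * sample when it is read.
 */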
void dd_preemphasis_I(double* srcdst, long len) {
long s;
for(s=len-1; s>0; --s) {
srcdst[s] = srcdst[s] - 0.97*srcdst[s-1];
}
}
double* dd_hamming(double* src, double *dest, long len) {
long n;
for(n=0; n<len; n++) {
dest[n] = src[n] * g_hamming_table[n];
}
return dest;
}
double dd_avepower(double* src, long len) {
double result = 0.0;
long i;
for(i=0; i<len; i++){
result += src[i]*src[i] / (double)(len);
}
// if(result < 1.0E-16) {
// result = 1.0E-8 * (rand()%10) / (double)len;
// }
return result;
}
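/**
 * Frame-wise log power: each frame of WINDOW_SIZE samples is Hamming-windowed
 * and its average power is converted to dB as 10 * log10(mean(x[n]^2)).
 * Returns a newly allocated array of length *dest_len (the caller frees it).
 */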
double* dd_frame_power(double* src, long src_len, long* dest_len) {
long f = 0;
long frame_len;
double *frame_power;
double *frame_tmp;
frame_len = (long)((double)(src_len - WINDOW_SIZE) / (double)FRAME_SHIFT + 0.5);
frame_power = (double *)malloc(frame_len*sizeof(double));
frame_tmp = (double *)malloc(WINDOW_SIZE*sizeof(double));
for(f=0; f<frame_len; f++){
dd_hamming(&src[f*FRAME_SHIFT], frame_tmp, WINDOW_SIZE);
frame_power[f] = dd_avepower(frame_tmp, WINDOW_SIZE);
frame_power[f] = 10.0 * log10(frame_power[f]);
// printf("%d:%f\n", f, frame_power[f]);
}
*dest_len = frame_len;
free(frame_tmp); frame_tmp = NULL;
return frame_power;
}
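/**
 * Gaussian density where the 'sigma' argument is the VARIANCE, not the
 * standard deviation:
 *   N(x; mu, sigma) = exp(-(x - mu)^2 / (2 * sigma)) / sqrt(2 * pi * sigma)
 * mix2_gauss_prob() below evaluates the two-component mixture
 *   p(x) = lambda * N(x; mu1, sigma1) + (1 - lambda) * N(x; mu2, sigma2).
 */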
double gauss_prob(double x, double mu, double sigma){
double x_mu = x-mu;
return exp(-(x_mu*x_mu)/(2.0*sigma)) / sqrt(TPI*sigma);
}
double mix2_gauss_prob(double x, GAUSS_PROB_PARAM* param){
// double x_mu1, x_mu2;
// x_mu1 = x - param->mu1;
// x_mu2 = x - param->mu2;
return param->lambda * gauss_prob(x, param->mu1, param->sigma1)
+ (1.0-param->lambda) * gauss_prob(x, param->mu2, param->sigma2);
}
void init_gaussparam(GAUSS_PROB_PARAM *param, double* data, long len){
long i;
// Initial weight is split evenly (= 0.5).
param->lambda = 0.5;
#if INITIALIZE_MODE==3
{
// Set the means to points offset from the sample mean by 0.2 * the sample standard deviation.
long double s_mean = 0.0L;
long double s_var = 0.0L;
long double s_sd = 0.0L;
for(i=0; i<len; ++i) {
s_mean += data[i];
}
s_mean /= (double)len;
for(i=0; i<len; ++i) {
double x_mean = data[i]-s_mean;
s_var += x_mean * x_mean;
}
s_var /= (double)(len - 1);
s_sd = sqrt(s_var);
param->mu1 = s_mean - 0.2*s_sd;
param->mu2 = s_mean + 0.2*s_sd;
param->sigma1 = 0.8 * s_var;
param->sigma2 = 0.8 * s_var;
}
#elif INITIALIZE_MODE==2
{
// Sort, then pick the initial values from percentiles.
qsort(data, len, sizeof(double), (int (*)(const void*, const void*))my_compare_double);
// for(i=0; i<len; ++i) {
// printf("%d: %f\n", i, data[i]);
// }
param->mu1 = data[(int)(0.1*len)];
param->mu2 = data[(int)(0.9*len)];
// Initial variances are 10.
param->sigma1 = 10.0;
param->sigma2 = 10.0;
}
#else
{
double mu1, mu2;
// Initial means are the minimum and maximum of the training data.
mu1 = mu2 = data[0];
for(i=1; i<len; i++){
if(data[i] < mu1) mu1 = data[i];
if(data[i] > mu2) mu2 = data[i];
}
// param->mu1 = mu1;
// param->mu2 = mu2;
// The original used the 90th percentile; here we use the value at 90% of the range instead.
param->mu1 = mu1 + 0.1 * (mu2 - mu1);
param->mu2 = mu2 - 0.1 * (mu2 - mu1);
// Initial variances are 1.
param->sigma1 = 1.0;
param->sigma2 = 1.0;
}
#endif
}
double loglikelyhood(GAUSS_PROB_PARAM* param, double* data, int data_len){
int i;
double sum=0.0;
for(i=0; i<data_len; i++){
sum += log(mix2_gauss_prob(data[i], param)+1.0E-12);
}
return sum;
}
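/**
 * EM for the two-component Gaussian mixture over the frame log-powers.
 * Per iteration, with responsibilities Q1(x) = lambda*N1(x)/p(x) and
 * Q2(x) = (1-lambda)*N2(x)/p(x):
 *   lambda  <- sum Q1 / (sum Q1 + sum Q2)
 *   mu_k    <- sum x*Qk / sum Qk
 *   sigma_k <- sum (x - old mu_k)^2 * Qk / sum Qk   (then floored/ceiled)
 * The loop ends after loop_count iterations, on NaN, or once the log-likelihood
 * improvement stays below apply_residual for more than 10 consecutive iterations.
 * On return the components are sorted so that mu1 <= mu2; since the data are
 * frame powers in dB, mu2 - mu1 is directly a segmental SNR estimate in dB.
 */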
double em_2mixture(GAUSS_PROB_PARAM *param, int loop_count, double apply_residual){
int i, k;
int same_count=0;
long double new_loglikelyhood = 0.0, old_loglikelyhood = 0.0;
double *data = param->data;
long len = param->len;
// Initialize the parameters.
init_gaussparam(param, data, len);
new_loglikelyhood = loglikelyhood(param, data, len);
#if _DEBUG
printf("#loop lambda mu1 sigma1 mu2 sigma2 loglikelyhood\n");
printf("%3d %f %f %f %f %f %Lf\n",
-1, param->lambda, param->mu1, param->sigma1, param->mu2, param->sigma2, new_loglikelyhood);
#endif
for(k=0; k<loop_count; k++){
long double prob_m1, prob_m2, prob_mix, prob_Q1, prob_Q2;
long double prob_sum1=0.0, prob_sum2=0.0;
long double new_mu1=0.0, new_mu2=0.0;
long double new_sigma1=0.0, new_sigma2=0.0;
long double new_lambda=0.0;
long double x_mu;
// Compute expectations.
for(i=0; i<len; i++){
//
// E-step
//
prob_mix = mix2_gauss_prob(data[i], param);
prob_m1 = param->lambda * gauss_prob(data[i], param->mu1, param->sigma1);
prob_Q1 = prob_m1 / prob_mix;
prob_m2 = (1.0 - param->lambda) * gauss_prob(data[i], param->mu2, param->sigma2);
prob_Q2 = prob_m2 / prob_mix;
//
// M-step
//
// weight
prob_sum1 += prob_Q1;
prob_sum2 += prob_Q2;
// mu
new_mu1 += data[i] * prob_Q1;
new_mu2 += data[i] * prob_Q2;
// sigma
x_mu = data[i] - param->mu1;
new_sigma1 += x_mu * x_mu * prob_Q1;
x_mu = data[i] - param->mu2;
new_sigma2 += x_mu * x_mu * prob_Q2;
}
new_lambda = prob_sum1 / (prob_sum1 + prob_sum2);
new_mu1 /= prob_sum1;
new_mu2 /= prob_sum2;
new_sigma1 /= prob_sum1;
new_sigma2 /= prob_sum2;
// Weight flooring.
// if(new_lambda < LAMBDA_FLOOR) new_lambda = LAMBDA_FLOOR;
// else if(new_lambda > (1.0-LAMBDA_FLOOR)) new_lambda = (1.0-LAMBDA_FLOOR);
if(new_lambda < LAMBDA_FLOOR) new_lambda = 0.5;
else if(new_lambda > (1.0-LAMBDA_FLOOR)) new_lambda = 0.5;
// Variance flooring and ceiling.
if(new_sigma1 < VAR_FLOOR) new_sigma1 = VAR_FLOOR;
if(new_sigma2 < VAR_FLOOR) new_sigma2 = VAR_FLOOR;
if(new_sigma1 > VAR_CEIL) new_sigma1 = VAR_CEIL;
if(new_sigma2 > VAR_CEIL) new_sigma2 = VAR_CEIL;
// Update the parameters.
param->lambda = new_lambda;
param->mu1 = new_mu1;
param->mu2 = new_mu2;
param->sigma1 = new_sigma1;
param->sigma2 = new_sigma2;
// Compute the likelihood with the new parameters.
new_loglikelyhood = loglikelyhood(param, data, len);
#if _DEBUG
printf("%3d %f %f %f %f %f %Lf\n",
k, param->lambda, param->mu1, param->sigma1, param->mu2, param->sigma2, new_loglikelyhood);
#endif
// Guard against NaN.
if(isnan(new_loglikelyhood)) break;
if(new_loglikelyhood - old_loglikelyhood < apply_residual){
same_count++;
}else{
same_count = 0;
}
old_loglikelyhood = new_loglikelyhood;
if(same_count > 10) break;
}
// sort parameters
if( param->mu1 > param->mu2 ) {
double tmp;
param->lambda = 1.0 - param->lambda;
tmp = param->mu1; param->mu1 = param->mu2; param->mu2 = tmp;
tmp = param->sigma1; param->sigma1 = param->sigma2; param->sigma2 = tmp;
}
return new_loglikelyhood;
}
void usage()
{
fprintf(stderr, "usage: emsnr [-h] [-s] [-x] [-r residual] [-l maxloop] < filelist\n");
fprintf(stderr, " -h: output help and exit.\n");
fprintf(stderr, " -s: not output system parameter on startup.\n");
fprintf(stderr, " -x: speech files are treated as BigEndian.\n");
fprintf(stderr, " -r: EM stop paramter. Minimum residual of log-likelyfood updating.\n");
fprintf(stderr, " -l: EM stop paramter. Maximum count of EM loop.\n");
fprintf(stderr, "\n");
fprintf(stderr, "example: echo test.raw | emsnr -r 1.0E-8 -l 1000\n");
}
int main(int argc, char* argv[])
{
int loop_count = 1000;
char filename[FILENAME_MAX], *p;
double *data, *data_raw;
double loglikelyhood = 0.0;
double apply_residual = 1.0E-8;
int bSwap = 0;
int bVerbose = 1;
int bPreEmphasis = 0;
long len_raw, len;
GAUSS_PROB_PARAM param;
long filecount = 0;
double snrsum = 0.0;
srand((unsigned)time(NULL));
// Load arguments.
{
int i = 0;
while(++i<argc) {
if(strcmp(argv[i], "-r")==0) {
apply_residual = atof(argv[++i]);
} else if(strcmp(argv[i], "-l")==0) {
loop_count = atoi(argv[++i]);
} else if(strcmp(argv[i], "-x")==0) {
bSwap = 1;
} else if(strcmp(argv[i], "-e")==0) {
bPreEmphasis = 1;
} else if(strcmp(argv[i], "-s")==0) {
bVerbose = 0;
} else if(strcmp(argv[i], "-h")==0) {
usage();
return 0;
} else {
break;
}
}
}
// Output Parameters
if(bVerbose>0) {
fprintf(stderr, "SYSTEM INFORMATION -----------------------------\n");
fprintf(stderr, " source-file: %s\n", __FILE__);
fprintf(stderr, " compiled at %s %s\n", __DATE__, __TIME__); fprintf(stderr, "SPEECH PARAMETERS ------------------------------\n");
fprintf(stderr, " pre-emphasis: %s\n", (bPreEmphasis)?"True":"False");
fprintf(stderr, " swap: %s\n", (bSwap)?"True":"False");
fprintf(stderr, "EM PARAMETERS ----------------------------------\n");
fprintf(stderr, " loop count: %d\n", loop_count);
fprintf(stderr, " LL update: %e\n", apply_residual);
fprintf(stderr, "OUTPUT -----------------------------------------\n");
fprintf(stderr, "filename seg-SNR fix-SNR ptile-SNR lambda low_power high_power low_variance high_variance loglikelyhood\n");
fprintf(stderr, "================================================\n");
}
// Initialize table of hamming window's coefficients
init_hamming();
// Loop: get filenames from stdin
while(NULL != fgets(&filename[0], FILENAME_MAX, stdin)){
double snr = 0.0;
double snr_fix = 0.0;
double snr_ptile = 0.0;
while(NULL != (p=(char*)strrchr(filename, '\r'))) *p = '\0';
while(NULL != (p=(char*)strrchr(filename, '\n'))) *p = '\0';
// get speech data from file
printf("%s ", filename);
data_raw = i16d_readdata(filename, &len_raw, bSwap);
if(data_raw == NULL) {
fprintf(stderr, "Error at readdata[%s].\n", filename);
return 1;
}
// Pre-Emphasis
if(bPreEmphasis)
dd_preemphasis_I(data_raw, len_raw);
#if _DEBUG
// debug
{
FILE* fpdeb = fopen("data_pre.raw", "wb");
fwrite(data_raw, sizeof(double), len_raw, fpdeb);
fclose(fpdeb);
}
#endif
// Put the frame powers into data.
data = dd_frame_power(data_raw, len_raw, &len);
// fprintf(stderr, "frame len: %d\n", len);
printf("%d ", len);
fflush(stdout);
param.data = data; param.len = len;
loglikelyhood = em_2mixture(&param, loop_count, apply_residual);
snr = fabs(param.mu2 - param.mu1);
// SNR computation based on Dat's method.
// Note: the ICASSP 2005 manuscript writes d as a difference, but that is a
// typo for a sum; it is corrected in the Speech Communication paper.
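// Reading off the code below (not re-derived from [1]): a = 10/ln(10) converts
// natural-log power back to dB, and m and d are |mu2 - mu1| and (sigma1 + sigma2)
// rescaled from dB to natural-log power. Each term exp(-k*m + k^2*d/2) equals
// E[exp(-k*X)] for X ~ N(m, d), so the three subtracted terms act as
// successively higher-order corrections for the low-SNR bias.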
{
if(snr > 10.0) {
snr_fix = snr;
} else {
double a = 4.342944819; // = 10.0/ln(10.0);
double ia = 0.2302585093; // = ln(10.0)/10.0;
double iaia = 0.0530189811; // = (ln(10.0)/10.0)^2;
double m = ia * fabs(param.mu2 - param.mu1);
double d = iaia * (param.sigma2 + param.sigma1);
double snr1 = snr - a * 0.7 * exp( -m + 0.5*d ); // -(m-d/2)
double snr2 = snr1 - a * 0.9 * exp( -2.0*m + 2.0*d ); // -2(m-d)
double snr3 = snr2 - a * exp( -3.0*m + 4.5*d ); // -3(m-3d/2)
// Apply the correction only while it is not too strong.
// -8.945 = 10/ln(10) * ln(e^r - 1), with r = 0.12.
// The correction normally decreases the value, so snr1 < snr.
if((snr - snr1 > 8.0) || (snr1 < -8.945)) {
snr_fix = snr;
} else if((snr1 - snr2 > 8.0) || (snr2 < -8.945)) {
snr_fix = snr1;
} else if((snr2 - snr3 > 8.0) || (snr3 < -8.945)) {
snr_fix = snr2;
} else {
snr_fix = snr3;
}
}
}
{
// Percentile-based SNR: 80th percentile minus 20th percentile of the frame powers.
qsort(data, len, sizeof(double), (int (*)(const void*, const void*))my_compare_double);
snr_ptile = data[(int)(0.8*len)] - data[(int)(0.2*len)];
}
printf("%f %f %f %f %f %f %f %f %f\n",
snr, snr_fix, snr_ptile,
param.lambda,
param.mu1, param.mu2,
param.sigma1, param.sigma2,
loglikelyhood);
fflush(stdout);
free(data_raw); data_raw = NULL;
free(data); data = NULL;
memset(&filename[0], 0, FILENAME_MAX);
// Accumulate for the final summary line.
if(!isnan(snr)) {
++filecount;
snrsum += snr;
}
}
fprintf(stderr, "Filecount=%d MeanSNR=%.2f\n", filecount, snrsum/filecount);
return 0;
}