Created
February 2, 2015 05:17
-
-
Save stormxuwz/f90cca402996c492120e to your computer and use it in GitHub Desktop.
MFCC for signal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import numpy as np; | |
import matplotlib.pyplot as plt | |
import warnings | |
# Ref: http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/ | |
# Ref2:http://python-speech-features.readthedocs.org/en/latest/. However, I checked the library code and found there might be a mistake when using rfft | |
def preemphasis(signal,coeff=0.95): | |
return numpy.append(signal[0],signal[1:]-coeff*signal[:-1]) | |
def create_frames(sig,rate,frameSize,overlap=0.5): | |
sig_size=len(sig) | |
num_of_overlap=int(frameSize*overlap) | |
frames=[sig[i:i+frameSize] for i in np.arange(0,sig_size-frameSize,frameSize-num_of_overlap)] | |
return frames; # Frames is a list | |
def spectrogram(frames,nfft=None,returnType="mfcc"): | |
if nfft is None: | |
nfft=len(frames[0]) | |
window=np.hamming(nfft) | |
energy_for_frames=[] | |
for sub_sig in frames: | |
fft_result=np.fft.rfft(sub_sig*window,nfft) | |
energy_for_frames.append(np.abs(fft_result)) | |
energy_for_frames=np.array(energy_for_frames) | |
if returnType=="log": | |
return np.log(energy_for_frames) | |
elif returnType=="mfcc": | |
return energy_for_frames**2/nfft | |
else: | |
return energy_for_frames | |
def mel_filterbank(nfft,rate,nbanks=26,lowfreq=0,highfreq=None): | |
if highfreq is None: | |
highfreq=rate/2 | |
def hz2mel(f): | |
return 1125*np.log(1+f/700) | |
def mel2hz(m): | |
return 700*(np.exp(m/1125)-1) | |
highMel=hz2mel(highfreq) | |
lowMel=hz2mel(lowfreq) | |
melPoints=np.linspace(lowMel,highMel,num=nbanks+2) | |
HzPoints=mel2hz(melPoints) | |
# print HzPoints | |
# print rate/nfft | |
fbins=np.floor(nfft*HzPoints/rate) | |
#floor([HzPoints/(rate/nfft)]+1) | |
fbanks=np.zeros((nbanks,nfft/2+1)) | |
# print fbins | |
for j in range(0,nbanks): | |
# print j | |
for i in range(int(fbins[j]),int(fbins[j+1])): | |
fbanks[j,i]=(i-fbins[j])/(fbins[j+1]-fbins[j]) | |
for i in range(int(fbins[j+1]),int(fbins[j+2])): | |
fbanks[j,i]=(fbins[j+2]-i)/(fbins[j+2]-fbins[j+1]) | |
return fbanks; | |
def apply_melBanks(spectrogram_array,fbanks): | |
print "filterbank size",spectrogram_array.shape,fbanks.shape; | |
return np.dot(spectrogram_array,np.transpose(fbanks)) # return (num_frame,n_banks) | |
def myMFCC(sig,rate,nfft): | |
from scipy.fftpack import dct | |
frames=create_frames(sig,rate,nfft) | |
# print frames[0] | |
spec=spectrogram(frames,nfft) | |
fbanks=mel_filterbank(nfft,rate) | |
melBank_energy=apply_melBanks(spec,fbanks) | |
# print melBank_coeff.shape | |
with warnings.catch_warnings(): | |
warnings.filterwarnings('error') | |
try: | |
log_melBank_energy=np.log(melBank_energy) | |
except: | |
print melBank_energy | |
return dct(log_melBank_energy,type=2,axis=1)[:,:13] # return (n_frame,13) as features | |
if __name__ == '__main__': | |
import os | |
from scipy.io.wavfile import read,write | |
musicFileRoot="../hw3/cs598ps_3/SpeechMusic/music/" | |
rate,sig=read(musicFileRoot+"1.wav") | |
print len(sig)/1024 | |
a=myMFCC(sig,rate,1024) | |
print a.shape |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment