@sailist
Created June 21, 2019 06:36
MFCC feature extraction for machine learning: provides feature extraction for a single audio clip as well as for batches of audio.
import numpy as np
from librosa.feature import melspectrogram
from scipy import signal
class MelFeature():
    '''
    The Mel features produced by this implementation never looked right,
    so it is no longer used. Use MelFeature2 / MelFeature3 instead.
    '''
    def __init__(self, sr=16000, n_fft=2048, hop_length=512, power=2.0, n_mels=128, **kwargs):
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.power = power
        self.n_mels = n_mels

    def batch_mfcc(self, x):
        '''
        Extract features for every clip in the batch.
        :param x: iterable of 1-D waveforms
        :return: list of Mel spectrograms
        '''
        features = list(map(lambda xi: self.mfcc(xi), x))
        return features

    def __call__(self, x):
        return self.batch_mfcc(x)

    def mfcc(self, x):
        '''
        No padding is applied.
        :param x: 1-D waveform
        :return: Mel spectrogram, shape [n_mels, n_frames]
        '''
        x = x / np.max(np.abs(x))  # peak-normalize the volume
        return melspectrogram(y=x,
                              sr=self.sr,
                              n_fft=self.n_fft,
                              hop_length=self.hop_length,
                              power=self.power,
                              n_mels=self.n_mels)

class MelFeature2():
    '''
    Follows the implementation in this notebook; it is simple, but the
    feature length is hard to control:
    https://www.kaggle.com/ybonde/log-spectrogram-and-mfcc-filter-bank-example
    '''
    def __init__(self, sr=16000, window_size=20, step_size=10):
        self.sr = sr
        self.nperseg = int(round(window_size * sr / 1e3))  # window length in samples
        self.noverlap = int(round(step_size * sr / 1e3))   # overlap in samples

    def batch_mfcc(self, x):
        features = list(map(lambda xi: self.mfcc(xi), x))
        return features

    def __call__(self, x):
        return self.batch_mfcc(x)

    def mfcc(self, x):
        x = x / np.max(np.abs(x))  # peak-normalize the volume
        _, _, spec = signal.spectrogram(x, fs=self.sr,
                                        window='hann',
                                        nperseg=self.nperseg, noverlap=self.noverlap,
                                        detrend=False)
        eps = 1e-10  # avoid log(0)
        return np.log(spec.T.astype(np.float32) + eps)

class MelFeature3():
    '''
    Follows the implementation in this blog post; the post explains each
    step in detail and is a good reference:
    https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
    '''
    def __init__(self, sr=16000, window_size=20, step_size=10, pre_emphasis=0.97):
        self.sr = sr
        self.window_size = window_size
        self.step_size = step_size
        self.nperseg = int(round(window_size * sr / 1e3))
        self.noverlap = int(round(step_size * sr / 1e3))
        self.pre_emphasis = pre_emphasis
        self.frame_size = 0.025   # frame length in seconds
        self.frame_stride = 0.01  # frame step in seconds
        self.frame_length = self.frame_size * self.sr  # convert from seconds to samples
        self.frame_step = self.frame_stride * self.sr
        self.frame_length = int(round(self.frame_length))
        self.frame_step = int(round(self.frame_step))
        self.NFFT = 512  # typically 256 or 512
        self.nfilt = 40  # number of triangular Mel filters
        self.fbank = np.zeros((self.nfilt, int(np.floor(self.NFFT / 2 + 1))))
        self._initial_filter_bank()

    def batch_mfcc(self, x):
        features = list(map(lambda xi: self.mfcc(xi), x))
        return features

    def _initial_filter_bank(self):
        low_freq_mel = 0
        high_freq_mel = (2595 * np.log10(1 + (self.sr / 2) / 700))  # convert Hz to Mel
        mel_points = np.linspace(low_freq_mel, high_freq_mel, self.nfilt + 2)  # equally spaced on the Mel scale
        hz_points = (700 * (10 ** (mel_points / 2595) - 1))  # convert Mel back to Hz
        bins = np.floor((self.NFFT + 1) * hz_points / self.sr)  # FFT bin index of each filter edge
        for m in range(1, self.nfilt + 1):
            f_m_minus = int(bins[m - 1])  # left edge
            f_m = int(bins[m])            # center
            f_m_plus = int(bins[m + 1])   # right edge
            # rising slope of the m-th triangular filter
            for k in range(f_m_minus, f_m):
                self.fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
            # falling slope
            for k in range(f_m, f_m_plus):
                self.fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])

    def __call__(self, x):
        return self.batch_mfcc(x)

    def _pre_emphasis(self, x):
        # first-order high-pass filter: y[n] = x[n] - alpha * x[n-1]
        emphasized_signal = np.append(x[0], x[1:] - self.pre_emphasis * x[:-1])
        return emphasized_signal

    def _framing(self, x):
        '''
        :param x: audio signal after self._pre_emphasis(x)
        :return: 2-D array of overlapping frames, shape [num_frames, frame_length]
        '''
        signal_length = len(x)
        num_frames = int(np.ceil(float(np.abs(signal_length - self.frame_length)) / self.frame_step))  # make sure we have at least 1 frame
        pad_signal_length = num_frames * self.frame_step + self.frame_length
        z = np.zeros((pad_signal_length - signal_length))
        # pad the signal so that all frames have an equal number of samples,
        # without truncating any samples from the original signal
        pad_signal = np.append(x, z)
        # each row of `indices` holds the sample indices of one frame
        indices = np.tile(np.arange(0, self.frame_length), (num_frames, 1)) + np.tile(
            np.arange(0, num_frames * self.frame_step, self.frame_step), (self.frame_length, 1)).T
        frames = pad_signal[indices.astype(np.int32, copy=False)]
        return frames

    def _fft(self, x):
        '''
        :param x: audio frames after self._framing(x)
        :return: power spectrum of each frame
        '''
        x *= np.hamming(self.frame_length)  # apply a Hamming window to every frame
        mag_frames = np.absolute(np.fft.rfft(x, self.NFFT))  # magnitude of the FFT
        pow_frames = ((1.0 / self.NFFT) * ((mag_frames) ** 2))  # power spectrum
        return pow_frames

    def _filter_bank(self, x):
        '''
        :param x: power spectrum after self._fft(x)
        :return: log Mel filter-bank energies, shape [nfilt, num_frames]
        '''
        filter_banks = np.dot(x, self.fbank.T)
        filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # numerical stability
        filter_banks = 20 * np.log10(filter_banks)  # dB
        return filter_banks.T

    def mfcc(self, x):
        '''
        :param x: a single 1-D waveform
        :return: log Mel filter-bank features, shape [nfilt, num_frames]
        '''
        ex_flow = [self._pre_emphasis, self._framing, self._fft, self._filter_bank]
        for ifun in ex_flow:
            x = ifun(x)
        return x
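
# A minimal usage sketch, assuming 16 kHz, 1-second synthetic clips; with
# real data the waveforms would come from e.g. librosa.load(path, sr=16000).
# Each extractor maps a list of 1-D arrays to a list of 2-D feature matrices.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    batch = [rng.randn(16000).astype(np.float32) for _ in range(4)]  # four fake 1 s clips

    feats2 = MelFeature2()(batch)  # list of [num_frames, freq_bins] log spectrograms
    feats3 = MelFeature3()(batch)  # list of [nfilt, num_frames] log Mel filter banks
    print(feats2[0].shape, feats3[0].shape)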
@kokokkko

Well, cool!!!
