Created
June 21, 2019 06:36
-
-
Save sailist/d35f3472718e00886380935cda8459f0 to your computer and use it in GitHub Desktop.
用于机器学习的MFCC特征提取方法,提供了对单个音频的特征提取和batch级的音频的特征提取
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from keras.layers import Layer | |
import numpy as np | |
from keras.layers import Lambda | |
from librosa.feature import melspectrogram | |
from keras import backend as K | |
from keras.preprocessing.sequence import pad_sequences | |
from sklearn.preprocessing import scale as skscale | |
from scipy import signal | |
class MelFeature:
    """Mel-spectrogram feature extractor built on ``librosa``.

    NOTE (original author): the features produced by this implementation were
    suspected to be incorrect and the class was deprecated in favour of
    ``MelFeature2`` / ``MelFeature3``.  Kept for reference.
    """

    def __init__(self, sr=16000, n_fft=2048, hop_length=512, power=2.0,
                 n_mels=128, **kwargs):
        # All parameters are passed straight through to
        # librosa.feature.melspectrogram.
        self.sr = sr                  # sample rate of the input audio
        self.n_fft = n_fft            # FFT window size
        self.hop_length = hop_length  # hop between successive frames
        self.power = power            # exponent of the magnitude spectrogram
        self.n_mels = n_mels          # number of Mel bands

    def batch_mfcc(self, x):
        """Extract features for a batch of 1-D audio signals.

        :param x: iterable of 1-D numpy arrays (lengths may differ)
        :return: list of per-signal Mel spectrograms (no padding applied)
        """
        return [self.mfcc(sample) for sample in x]

    def __call__(self, x):
        return self.batch_mfcc(x)

    def mfcc(self, x):
        """Extract the Mel spectrogram of a single 1-D signal (no padding).

        :param x: 1-D numpy array of audio samples
        :return: Mel spectrogram of shape (n_mels, n_frames)
        """
        # Volume normalisation.  Guard against an all-zero (silent) signal,
        # which previously caused a division by zero and a NaN-filled result.
        peak = np.max(np.abs(x))
        if peak > 0:
            x = x / peak
        # librosa >= 0.10 requires the signal to be passed as keyword ``y``;
        # passing it by keyword is also accepted by older versions.
        return melspectrogram(y=x,
                              sr=self.sr,
                              n_fft=self.n_fft,
                              hop_length=self.hop_length,
                              power=self.power,
                              n_mels=self.n_mels)
class MelFeature2:
    """Log power-spectrogram features via ``scipy.signal.spectrogram``.

    Simple to use, but the output length is hard to control.  Based on:
    https://www.kaggle.com/ybonde/log-spectrogram-and-mfcc-filter-bank-example
    """

    def __init__(self, sr=16000, window_size=20, step_size=10):
        self.sr = sr
        # Convert window / step lengths from milliseconds to sample counts.
        self.nperseg = int(round(window_size * sr / 1e3))
        self.noverlap = int(round(step_size * sr / 1e3))

    def batch_mfcc(self, x):
        """Extract features for every signal in an iterable of 1-D arrays."""
        return [self.mfcc(sample) for sample in x]

    def __call__(self, x):
        return self.batch_mfcc(x)

    def mfcc(self, x):
        """Return the log power spectrogram of one 1-D signal.

        :param x: 1-D numpy array of audio samples
        :return: float32 array shaped (time, frequency)
        """
        normalized = x / np.max(np.abs(x))  # volume normalisation
        _, _, spec = signal.spectrogram(normalized, fs=self.sr,
                                        window='hann',
                                        nperseg=self.nperseg,
                                        noverlap=self.noverlap,
                                        detrend=False)
        # A small epsilon keeps log() away from -inf on zero bins.
        return np.log(spec.T.astype(np.float32) + 1e-10)
class MelFeature3:
    """Log Mel filter-bank features implemented from scratch with numpy.

    Follows the detailed walkthrough at:
    https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
    """

    def __init__(self, sr=16000, window_size=20, step_size=10, pre_emphasis=0.97):
        self.sr = sr
        self.window_size = window_size
        self.step_size = step_size
        self.nperseg = int(round(window_size * sr / 1e3))
        self.noverlap = int(round(step_size * sr / 1e3))
        self.pre_emphasis = pre_emphasis
        # Framing parameters: 25 ms windows with a 10 ms stride.
        self.frame_size = 0.025
        self.frame_stride = 0.01
        self.frame_length = int(round(self.frame_size * self.sr))  # seconds -> samples
        self.frame_step = int(round(self.frame_stride * self.sr))
        self.NFFT = 512   # typically 256 or 512
        self.nfilt = 40   # number of triangular Mel filters
        self.fbank = np.zeros((self.nfilt, int(np.floor(self.NFFT / 2 + 1))))
        self._initial_filter_bank()

    def batch_mfcc(self, x):
        """Extract features for every signal in an iterable of 1-D arrays."""
        return [self.mfcc(sample) for sample in x]

    def _initial_filter_bank(self):
        """Fill ``self.fbank`` with triangular filters equally spaced on the Mel scale."""
        high_freq_mel = 2595 * np.log10(1 + (self.sr / 2) / 700)        # Hz -> Mel
        mel_points = np.linspace(0, high_freq_mel, self.nfilt + 2)      # even Mel spacing
        hz_points = 700 * (10 ** (mel_points / 2595) - 1)               # Mel -> Hz
        bins = np.floor((self.NFFT + 1) * hz_points / self.sr)
        for m in range(1, self.nfilt + 1):
            left = int(bins[m - 1])
            center = int(bins[m])
            right = int(bins[m + 1])
            # Rising edge of triangle m.
            for k in range(left, center):
                self.fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
            # Falling edge of triangle m.
            for k in range(center, right):
                self.fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])

    def __call__(self, x):
        return self.batch_mfcc(x)

    def _pre_emphasis(self, x):
        """Apply a first-order pre-emphasis filter: y[t] = x[t] - a * x[t-1]."""
        return np.append(x[0], x[1:] - self.pre_emphasis * x[:-1])

    def _framing(self, x):
        """Slice the pre-emphasised signal into overlapping frames.

        :param x: 1-D signal (output of ``self._pre_emphasis``)
        :return: 2-D array of shape (num_frames, frame_length)
        """
        n_samples = len(x)
        # abs() keeps the count non-negative for very short signals.
        num_frames = int(np.ceil(
            float(np.abs(n_samples - self.frame_length)) / self.frame_step))
        padded_length = num_frames * self.frame_step + self.frame_length
        # Zero-pad so every frame is complete and no samples are truncated.
        padded = np.append(x, np.zeros(padded_length - n_samples))
        indices = (np.tile(np.arange(0, self.frame_length), (num_frames, 1))
                   + np.tile(np.arange(0, num_frames * self.frame_step, self.frame_step),
                             (self.frame_length, 1)).T)
        return padded[indices.astype(np.int32, copy=False)]

    def _ftt(self, x):
        """Window each frame with a Hamming window and return its power spectrum.

        :param x: frames from ``self._framing``
        :return: power spectrum, shape (num_frames, NFFT // 2 + 1)
        """
        windowed = x * np.hamming(self.frame_length)
        mag_frames = np.absolute(np.fft.rfft(windowed, self.NFFT))  # FFT magnitude
        return (1.0 / self.NFFT) * (mag_frames ** 2)                # power spectrum

    def _filter_bank(self, x):
        """Project the power spectrum onto the Mel filter bank, in dB.

        :param x: power frames from ``self._ftt``
        :return: log filter-bank energies, shape (nfilt, num_frames)
        """
        energies = np.dot(x, self.fbank.T)
        # Replace exact zeros so log10 stays finite (numerical stability).
        energies = np.where(energies == 0, np.finfo(float).eps, energies)
        return (20 * np.log10(energies)).T

    def mfcc(self, x):
        """Run the full pipeline on a single 1-D signal.

        :param x: 1-D numpy array of audio samples
        :return: log Mel filter-bank features, shape (nfilt, num_frames)
        """
        for stage in (self._pre_emphasis, self._framing, self._ftt, self._filter_bank):
            x = stage(x)
        return x
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Well,cool!!!