# Last active January 31, 2021 20:47
# [Python] Mel-frequency Cepstrum
from IPython.display import Audio
import numpy as np
from numpy.fft import fft, ifft
from scipy.fftpack import dct, idct
from scipy.signal import stft
import soundfile as sf
from copy import deepcopy
def freq2mel(f):
    """Convert a frequency in Hz to the mel scale.

    Computes ``2595 * log10(1 + f / 700)``.

    Args:
        f: frequency in Hz; a scalar or a NumPy array.

    Returns:
        The mel value(s), same shape as ``f``.
    """
    return 2595. * np.log10(1 + f / 700.)


def mel2freq(m):
    """Convert a mel-scale value back to Hz (inverse of ``freq2mel``).

    Computes ``700 * (10 ** (m / 2595) - 1)``.

    Args:
        m: mel value; a scalar or a NumPy array.

    Returns:
        The frequency in Hz, same shape as ``m``.
    """
    return 700. * (10**(m / 2595.) - 1)
def pre_emphasis(x, coeff=0.97):
    """Apply a first-order pre-emphasis filter to the signal.

    - balances frequencies in the spectrum by increasing the amplitude of
      high frequency bands and decreasing the amplitudes of lower bands
    - largely unnecessary in modern feature extraction pipelines

    The output is ``y[0] = x[0]`` and ``y[t] = x[t] - coeff * x[t - 1]``.

    Args:
        x: 1-d array (or sequence) of samples.
        coeff: pre-emphasis coefficient; defaults to 0.97.

    Returns:
        y, array of samples with the same length as ``x``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to filter; indexing x[0] below would raise on empty input.
        return x.astype(float)
    return np.append(x[0], x[1:] - coeff * x[:-1])
def hamming(n):
    """Return the Hamming weights for a window of ``n`` samples.

    Computes ``0.54 - 0.46 * cos(2 * pi * k / (n - 1))`` for ``k = 0..n-1``.
    Feel free to implement more window functions.

    Args:
        n: window size (integer number of samples).

    Returns:
        win, array of ``n`` weights to apply along the window.
    """
    if n <= 1:
        # The formula divides by (n - 1); a one-sample window is just [1.0]
        # and a zero-length window is empty.
        return np.ones(max(n, 0))
    return 0.54 - .46 * np.cos(2 * np.pi * np.arange(n) / (n - 1))
def windowing(x, size, step):
    """Window and stack a signal into overlapping frames.

    The signal is zero-padded at the end just enough that the final frame is
    complete, so every input sample appears in at least one frame (provided
    ``step <= size``) and signals shorter than one window yield one frame.

    Args:
        x: 1-d array of samples.
        size: window size in number of samples (Note: this may need to be a
            power of 2 for downstream FFT code).
        step: window shift in number of samples.

    Returns:
        frames, 2d-array of frames with shape (number of windows, window size).
        An empty signal yields an array of shape (0, size).
    """
    x = np.asarray(x)
    n = len(x)
    if n == 0:
        return np.zeros((0, size))

    # One frame at offset 0 plus however many shifts are needed to reach the
    # last sample: ceil((n - size) / step), never negative.
    n_frames = 1 + max(0, -(-(n - size) // step))
    total = (n_frames - 1) * step + size
    xpad = np.append(x, np.zeros(total - n))

    # Gather all frames with one fancy-indexing operation.
    starts = step * np.arange(n_frames)
    idx = starts[:, None] + np.arange(size)
    return xpad[idx]
def discrete_fourier_transform(x):
    """Compute the discrete Fourier transform of each frame of windowed signal x.

    Typically, we talk about performing the DFT on short-time windows
    (often referred to as the Short-Time Fourier Transform). Here, the input
    is a 2d-array with shape (window size, number of windows) and the DFT is
    applied along the first axis, i.e. to each window, by multiplying with
    the n-by-n DFT matrix.

    Args:
        x: 2d-array of frames with shape (window size, number of windows).
            A 1-d array of length n is also accepted.

    Returns:
        X, 2d-array of complex spectrum after DFT applied to each window of x.
    """
    x = np.asarray(x)
    n = len(x)
    indices = np.arange(n)
    # M[k, t] = exp(-2j * pi * k * t / n): the DFT basis matrix.
    M = np.exp(-2j * np.pi * np.outer(indices, indices) / n)
    return np.dot(M, x)
def fast_fourier_transform(x):
    """Fast Fourier transform: an efficient algorithm for computing the DFT.

    Recursively splits the input into even- and odd-indexed samples along the
    first axis (radix-2 decimation in time) and falls back to the direct
    ``discrete_fourier_transform`` for short or odd-length inputs.

    Args:
        x: 2d-array of frames with shape (window size, number of windows).
            A 1-d array is also accepted.

    Returns:
        X, 2d-array of complex spectrum after DFT applied to each window of x.
    """
    x = np.asarray(x)
    fft_size = len(x)

    # Base case: small inputs are cheap to transform directly, and an odd
    # length cannot be split into equal even/odd halves.
    if fft_size <= 16 or fft_size % 2:
        return discrete_fourier_transform(x)

    even = fast_fourier_transform(x[::2])
    odd = fast_fourier_transform(x[1::2])

    # Twiddle factors, shaped to broadcast over any trailing axes of x.
    twiddle = np.exp(-2j * np.pi * np.arange(fft_size) / fft_size)
    twiddle = twiddle.reshape((-1,) + (1,) * (x.ndim - 1))
    half = fft_size // 2
    return np.concatenate([even + twiddle[:half] * odd,
                           even + twiddle[half:] * odd])
def mel_filterbank(nfilters, fft_size, sample_rate):
    """Build the mel-warping filterbank.

    You do not need to edit this code; it is needed to construct the mel
    filterbank which we will use to extract features. Filters are triangles
    whose edges are equally spaced on the mel scale between 0 Hz and
    ``sample_rate // 2``.

    Args:
        nfilters: number of filters.
        fft_size: window size over which the fft is performed.
        sample_rate: sampling rate of the signal.

    Returns:
        mel_filter, 2d-array of (fft_size / 2, nfilters) used to get mel
            features; each non-empty filter is normalised to sum to 1.
        mel_inv_filter, 2d-array of (nfilters, fft_size / 2) used to invert.
        melpoints, 1d-array of ``nfilters + 2`` filter edges on the mel scale.
    """
    # Kept local so this function does not depend on module-level helpers.
    def _hz_to_mel(f):
        return 2595. * np.log10(1 + f / 700.)

    def _mel_to_hz(m):
        return 700. * (10**(m / 2595.) - 1)

    nbins = fft_size // 2
    lowmel = _hz_to_mel(0)
    highmel = _hz_to_mel(sample_rate // 2)
    melpoints = np.linspace(lowmel, highmel, 1 + nfilters + 1)

    # must convert from freq to fft bin number
    fft_bins = ((fft_size + 1) * _mel_to_hz(melpoints) // sample_rate).astype(np.int32)
    # The top edge can land one past the last column (e.g. odd fft_size);
    # clip so the slices below never index out of bounds.
    fft_bins = np.minimum(fft_bins, nbins)

    filterbank = np.zeros((nfilters, nbins))
    for j in range(nfilters):
        left, centre, right = fft_bins[j], fft_bins[j + 1], fft_bins[j + 2]
        if centre > left:
            # Rising edge: 0 at `left`, approaching 1 just before `centre`.
            rising = np.arange(left, centre)
            filterbank[j, left:centre] = (rising - left) / (centre - left)
        if right > centre:
            # Falling edge: 1 at `centre`, approaching 0 just before `right`.
            falling = np.arange(centre, right)
            filterbank[j, centre:right] = (right - falling) / (right - centre)

    # Clip the per-filter sums so empty filters do not divide by zero.
    mel_filter = filterbank.T / filterbank.sum(axis=1).clip(1e-16)
    mel_inv_filter = filterbank
    return mel_filter, mel_inv_filter, melpoints
def inv_spectrogram(X_s, size, step, n_iter=15):
    """Estimate a waveform from a magnitude spectrogram.

    Feel free to disregard this code. It is not necessary that you follow
    the code below, but it can be used to invert from the spectrogram
    (signal spectrum magnitude) back to the signal, which can be helpful
    when qualitatively assessing the nature of compression into MFCC
    features.

    The half-spectrum is mirrored to full width, a first estimate is built
    from the real part only, and then for ``n_iter - 1`` rounds the estimate
    is re-windowed, transformed, and its unit-magnitude phase is combined
    with the given magnitudes (an iterative phase-estimation scheme).

    Args:
        X_s: 2d-array of magnitudes with shape (number of windows, bins).
        size: window size used when re-framing the estimate.
        step: window shift in number of samples.
        n_iter: total number of estimation rounds.

    Returns:
        1d real array holding the estimated signal.

    Note:
        Each round assumes ``windowing`` returns at least as many frames as
        ``X_s`` has rows.
    """

    def find_offset(a, b):
        # Cross-correlate the mean-removed segments and pick the best lag,
        # masking out the outer half-windows on both ends.
        corrs = np.convolve(a - a.mean(), b[::-1] - b.mean())
        corrs[:len(b) // 2] = -1e12
        corrs[-len(b) // 2:] = -1e12
        return corrs.argmax() - len(a)

    def iterate(X, iteration):
        T, n = X.shape
        # Half the mirrored width; deliberately distinct from the outer
        # `size` parameter (the original code shadowed it here).
        frame_len = n // 2
        win = hamming(frame_len)  # loop-invariant, computed once
        x = np.zeros((T * step + frame_len))
        window_sum = np.zeros((T * step + frame_len))

        est_start = frame_len // 2 - 1
        est_stop = est_start + frame_len
        for t in range(T):
            x_start = t * step
            x_stop = x_start + frame_len
            # First round has no phase yet, so use the real part only.
            est = ifft(X[t].real + 0j if iteration == 0 else X[t]).real[::-1]
            if t > 0 and x_stop - step > x_start and est_stop - step > est_start:
                offset = find_offset(x[x_start:x_stop - step], est[est_start:est_stop - step])
            else:
                offset = 0
            x[x_start:x_stop] += est[est_start - offset:est_stop - offset] * win
            window_sum[x_start:x_stop] += win
        # Normalise by the accumulated window weight (overlap-add).
        return x.real / window_sum.clip(1e-12)

    X_s = np.concatenate([X_s, X_s[:, ::-1]], axis=1)
    reg = np.max(X_s) / 1e8

    X_best = iterate(deepcopy(X_s), 0)
    for i in range(1, n_iter):
        X_best = windowing(X_best, size, step) * hamming(size)
        est = fast_fourier_transform(X_best.T).T
        # Unit-magnitude phase; `reg` guards against division by ~0.
        phase = est / np.maximum(reg, np.abs(est))
        X_best = iterate(X_s * phase[:len(X_s)], i)

    return np.real(X_best)
def display_audios(fname):
    """Play the original, pre-emphasized and MFCC-resynthesized audio.

    Reads ``fname`` with ``soundfile``, extracts MFCCs (pre-emphasis,
    windowing, FFT, mel filterbank, log, DCT), inverts the pipeline back to a
    waveform, and displays three IPython audio widgets.

    Args:
        fname: path of the audio file passed to ``sf.read``.
            NOTE(review): the pipeline treats the samples as a 1-d signal,
            so a mono file is presumably expected - confirm.

    Returns:
        None.
    """
    # `display` is only predefined inside IPython/Jupyter sessions; import it
    # explicitly so this also works when the file is used as a module.
    from IPython.display import display

    signal, fs = sf.read(fname)

    size = 128  # window size for the FFT
    step = size // 2  # distance to slide along the window in time
    nfilters = 26  # number of mel frequency channels
    ncoeffs = 13  # number of cepstral coefficients to keep

    # pre-emphasize signal
    pre_emphasized_signal = pre_emphasis(signal)

    # window signal
    frames = windowing(pre_emphasized_signal, size, step) * hamming(size)

    # compute complex spectrum
    spectrum = fast_fourier_transform(frames.T).T
    spectrum = spectrum[:, :size // 2]  # only need to keep half since it's symmetric

    # compute spectrum magnitude (typically what is meant by spectrogram)
    magnitude = np.abs(spectrum)

    # get spectrum power
    power = magnitude**2 / size

    # Generate the mel filter and mel inverse filter
    mel_filter, mel_inv_filter, _ = mel_filterbank(nfilters, size, fs)

    # apply mel warping filters to power spectrum and take log10; floor the
    # energies so silent frames or empty filters do not produce log10(0) = -inf
    mel_energy = np.maximum(power.dot(mel_filter), np.finfo(float).eps)
    log_mel_fbank = np.log10(mel_energy)

    # compute MFCCs using discrete cosine transform
    # Note: DCT is used to decompose a finite discrete-time vector
    # into a sum of scaled-and-shifted (real-valued) cosine functions
    # (this can be thought of similarly to the DFT); additionally,
    # the DCT often has better compression qualities as its top coefficients
    # tend to be largely decorrelated, which can improve our position when
    # making modeling assumptions later on
    mfccs = dct(log_mel_fbank, type=2, axis=1, norm='ortho')

    # keep subset of cepstral coefficients
    mfccs = mfccs[:, :ncoeffs]

    # invert from MFCCs back to waveform
    recovered_log_mel_fbank = idct(mfccs, type=2, n=nfilters, axis=1, norm='ortho')

    # exponentiate log and invert mel warping
    recovered_power = (10**recovered_log_mel_fbank).dot(mel_inv_filter)

    # convert recovered power back to spectrum magnitude
    recovered_magnitude = np.sqrt(recovered_power * size)
    recovered_signal = inv_spectrogram(recovered_magnitude, size, step)

    # (Note: preemphasis is not inverted in resynthesizing the speech)
    display(Audio(data=signal, rate=fs))
    display(Audio(data=pre_emphasis(signal), rate=fs))
    display(Audio(data=recovered_signal, rate=fs))
