RuolinZheng08/mfcc.py

## mfcc.py
from IPython.display import Audio

import numpy as np
from numpy.fft import fft, ifft

from scipy.fftpack import dct, idct
from scipy.signal import stft

import soundfile as sf

from copy import deepcopy

freq2mel = lambda f: 2595. * np.log10(1 + f / 700.)
mel2freq = lambda m: 700. * (10**(m / 2595.) - 1)

def pre_emphasis(x):
    """
    Applies pre-emphasis step to the signal.
    - balance frequencies in spectrum by increasing amplitude of high frequency
    bands and decreasing the amplitudes of lower bands
    - largely unnecessary in modern feature extraction pipelines
    ------
    :in:
    x, array of samples
    ------
    :out:
    y, array of samples
    """
    y = np.append(x[0], x[1:] - 0.97 * x[:-1])

    return y

def hamming(n):
    """
    Hamming method for weighting components of window.
    Feel free to implement more window functions.
    ------
    :in:
    n, window size
    ------
    :out:
    win, array of weights to apply along window
    """
    win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(n) / (n - 1))

    return win

def windowing(x, size, step):
    """
    Window and stack signal into overlapping frames.
    ------
    :in:
    x, array of samples
    size, window size in number of samples (Note: this may need to be a power of 2)
    step, window shift in number of samples
    ------
    :out:
    frames, 2d-array of frames with shape (number of windows, window size)
    """
    xpad = np.append(x, np.zeros((size - len(x) % size)))

    T = (len(xpad) - size) // step
    frames = np.stack([xpad[t * step:t * step + size] for t in range(T)])

    return frames

def discrete_fourier_transform(x):
    """
    Compute the discrete fourier transform for each frame of windowed signal x.
    Typically, we talk about performing the DFT on short-time windows
    (often referred to as the Short-Time Fourier Transform). Here, the input
    is a 2d-array with shape (window size,  number of windows). We want to
    perform the DFT on each of these windows.
    Note: this can be done in a vectorized form or in a loop.
    --------
    :in:
    x, 2d-array of frames with shape (window size, number of windows)
    --------
    :out:
    X, 2d-array of complex spectrum after DFT applied to each window of x
    """
    n = len(x)
    indices = np.arange(n)
    M = np.exp(-2j * np.pi * np.outer(indices, indices) / n)
    return np.dot(M, x)


def fast_fourier_transform(x):
    """
    Fast-fourier transform. Effiicient algorithm for computing the DFT.
    --------
    :in:
    x, 2d-array of frames with shape (window size, number of windows)
    --------
    :out:
    X, 2d-array of complex spectrum after DFT applied to each window of x
    """
    fft_size = len(x)

    if fft_size <= 16:
        X = discrete_fourier_transform(x)

    else:
        indices = np.arange(fft_size)
        even = fast_fourier_transform(x[::2])
        odd = fast_fourier_transform(x[1::2])
        m = np.exp(-2j * np.pi * indices / fft_size).reshape(-1, 1)
        X = np.concatenate([even + m[:fft_size // 2] * odd, even + m[fft_size // 2:] * odd])

    return X


def mel_filterbank(nfilters, fft_size, sample_rate):
    """
    Mel-warping filterbank.
    You do not need to edit this code; it is needed to contruct the mel filterbank
    which we will use to extract features.
    --------
    :in:
    nfilters, number of filters
    fft_size, window size over which fft is performed
    sample_rate, sampling rate of signal
    --------
    :out:
    mel_filter, 2d-array of (fft_size / 2, nfilters) used to get mel features
    mel_inv_filter, 2d-array of (nfilters, fft_size / 2) used to invert
    melpoints, 1d-array of frequencies converted to mel-scale
    """
    freq2mel = lambda f: 2595. * np.log10(1 + f / 700.)
    mel2freq = lambda m: 700. * (10**(m / 2595.) - 1)

    lowfreq = 0
    highfreq = sample_rate // 2

    lowmel = freq2mel(lowfreq)
    highmel = freq2mel(highfreq)

    melpoints = np.linspace(lowmel, highmel, 1 + nfilters + 1)

    # must convert from freq to fft bin number
    fft_bins = ((fft_size + 1) * mel2freq(melpoints) // sample_rate).astype(np.int32)

    filterbank = np.zeros((nfilters, fft_size // 2))
    for j in range(nfilters):
        for i in range(fft_bins[j], fft_bins[j + 1]):
            filterbank[j, i] = (i - fft_bins[j]) / (fft_bins[j + 1] - fft_bins[j])
        for i in range(fft_bins[j + 1], fft_bins[j + 2]):
            filterbank[j, i] = (fft_bins[j + 2] - i) / (fft_bins[j + 2] - fft_bins[j + 1])

    mel_filter = filterbank.T / filterbank.sum(axis=1).clip(1e-16)
    mel_inv_filter = filterbank

    return mel_filter, mel_inv_filter, melpoints


def inv_spectrogram(X_s, size, step, n_iter=15):
    """
    Feel free to disregard this code. It is not necessary that
    you follow the code below, but it can be used to invert
    from the spectrogram (signal spectrum magnitude) back to the signal
    which can be helpful when qualitatively assessing the nature of
    compression into MFCC features.
    """

    def find_offset(a, b):
        corrs = np.convolve(a - a.mean(), b[::-1] - b.mean())
        corrs[:len(b) // 2] = -1e12
        corrs[-len(b) // 2:] = -1e12
        return corrs.argmax() - len(a)

    def iterate(X, iteration):
        T, n = X.shape
        size = n // 2

        x = np.zeros((T * step + size))
        window_sum = np.zeros((T * step + size))

        est_start = size // 2 - 1
        est_stop = est_start + size

        for t in range(T):
            x_start = t * step
            x_stop = x_start + size

            est = ifft(X[t].real + 0j if iteration == 0 else X[t]).real[::-1]
            if t > 0 and x_stop - step > x_start and est_stop - step > est_start:
                offset = find_offset(x[x_start:x_stop - step], est[est_start:est_stop - step])
            else:
                offset = 0

            x[x_start:x_stop] += est[est_start - offset:est_stop - offset] * hamming(size)
            window_sum[x_start:x_stop] += hamming(size)

        return x.real / window_sum.clip(1e-12)

    X_s = np.concatenate([X_s, X_s[:, ::-1]], axis=1)
    reg = np.max(X_s) / 1e8

    X_best = iterate(deepcopy(X_s), 0)
    for i in range(1, n_iter):
        X_best = windowing(X_best, size, step) * hamming(size)
        est = fast_fourier_transform(X_best.T).T
        phase = est / np.maximum(reg, np.abs(est))
        X_best = iterate(X_s * phase[:len(X_s)], i)

    return np.real(X_best)

  def display_audios(fname):
    signal, fs = sf.read(fname)
    size = 128 # window size for the FFT
    step = size // 2 # distance to slide along the window in time
    nfilters = 26 # number of mel frequency channels
    ncoeffs = 13 # number of cepstral coeffecients to keep

    # pre-emphasize signal
    pre_emphasized_signal = pre_emphasis(signal)

    # window signal
    frames = windowing(pre_emphasized_signal, size, step) * hamming(size)

    # compute complex spectrum
    spectrum = fast_fourier_transform(frames.T).T
    spectrum = spectrum[:, :size // 2] # only need to keep half since it's symmetric

    # compute spectrum magnitude (typically what is meant by spectrogram)
    magnitude = np.abs(spectrum)

    # get spectrum power
    power = magnitude**2 / size

    # Generate the mel filter and mel inverse filter
    mel_filter, mel_inv_filter, melpoints = mel_filterbank(nfilters, size, fs)

    # apply mel warping filters to power spectrum and take log10
    log_mel_fbank = np.log10(power.dot(mel_filter).clip(1e-16))

    # compute MFCCs using discrete cosine transform
    """
    Note: DCT is used to decompose a finite discrete-time vector
    into a sum of scaled-and-shifted (real-valued) cosine functions
    (this can be thought of similarly to the DFT); additionally,
    the DCT often has better compression qualities as its top coefficients
    tend to by largely decorrelated, which can improve our position when
    make modeling assumptions later on
    """
    mfccs = dct(log_mel_fbank, type=2, axis=1, norm='ortho')

    # keep subset of cepstral coefficients
    mfccs = mfccs[:,:ncoeffs]

    # invert from MFCCs back to waveform
    recovered_log_mel_fbank = idct(mfccs, type=2, n=nfilters, axis=1, norm='ortho')

    # exponentiate log and invert mel warping
    recovered_power = (10**recovered_log_mel_fbank).dot(mel_inv_filter)

    # invert mel warping of spectrogram
    recovered_magnitude = np.sqrt(recovered_power * size)
    recovered_signal = inv_spectrogram(recovered_magnitude, size, step)
    #(Note: preemphasis is not inverted in resynthesizing the speech)

    display(Audio(data=signal, rate=fs))
    display(Audio(data=pre_emphasis(signal), rate=fs))
    display(Audio(data=recovered_signal, rate=fs))
	from IPython.display import Audio

	import numpy as np
	from numpy.fft import fft, ifft

	from scipy.fftpack import dct, idct
	from scipy.signal import stft

	import soundfile as sf

	from copy import deepcopy

	freq2mel = lambda f: 2595. * np.log10(1 + f / 700.)
	mel2freq = lambda m: 700. * (10**(m / 2595.) - 1)

	def pre_emphasis(x):
	"""
	Applies pre-emphasis step to the signal.
	- balance frequencies in spectrum by increasing amplitude of high frequency
	bands and decreasing the amplitudes of lower bands
	- largely unnecessary in modern feature extraction pipelines
	------
	:in:
	x, array of samples
	------
	:out:
	y, array of samples
	"""
	y = np.append(x[0], x[1:] - 0.97 * x[:-1])

	return y

	def hamming(n):
	"""
	Hamming method for weighting components of window.
	Feel free to implement more window functions.
	------
	:in:
	n, window size
	------
	:out:
	win, array of weights to apply along window
	"""
	win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(n) / (n - 1))

	return win

	def windowing(x, size, step):
	"""
	Window and stack signal into overlapping frames.
	------
	:in:
	x, array of samples
	size, window size in number of samples (Note: this may need to be a power of 2)
	step, window shift in number of samples
	------
	:out:
	frames, 2d-array of frames with shape (number of windows, window size)
	"""
	xpad = np.append(x, np.zeros((size - len(x) % size)))

	T = (len(xpad) - size) // step
	frames = np.stack([xpad[t * step:t * step + size] for t in range(T)])

	return frames

	def discrete_fourier_transform(x):
	"""
	Compute the discrete fourier transform for each frame of windowed signal x.
	Typically, we talk about performing the DFT on short-time windows
	(often referred to as the Short-Time Fourier Transform). Here, the input
	is a 2d-array with shape (window size, number of windows). We want to
	perform the DFT on each of these windows.
	Note: this can be done in a vectorized form or in a loop.
	--------
	:in:
	x, 2d-array of frames with shape (window size, number of windows)
	--------
	:out:
	X, 2d-array of complex spectrum after DFT applied to each window of x
	"""
	n = len(x)
	indices = np.arange(n)
	M = np.exp(-2j * np.pi * np.outer(indices, indices) / n)
	return np.dot(M, x)


	def fast_fourier_transform(x):
	"""
	Fast-fourier transform. Effiicient algorithm for computing the DFT.
	--------
	:in:
	x, 2d-array of frames with shape (window size, number of windows)
	--------
	:out:
	X, 2d-array of complex spectrum after DFT applied to each window of x
	"""
	fft_size = len(x)

	if fft_size <= 16:
	X = discrete_fourier_transform(x)

	else:
	indices = np.arange(fft_size)
	even = fast_fourier_transform(x[::2])
	odd = fast_fourier_transform(x[1::2])
	m = np.exp(-2j * np.pi * indices / fft_size).reshape(-1, 1)
	X = np.concatenate([even + m[:fft_size // 2] * odd, even + m[fft_size // 2:] * odd])

	return X


	def mel_filterbank(nfilters, fft_size, sample_rate):
	"""
	Mel-warping filterbank.
	You do not need to edit this code; it is needed to contruct the mel filterbank
	which we will use to extract features.
	--------
	:in:
	nfilters, number of filters
	fft_size, window size over which fft is performed
	sample_rate, sampling rate of signal
	--------
	:out:
	mel_filter, 2d-array of (fft_size / 2, nfilters) used to get mel features
	mel_inv_filter, 2d-array of (nfilters, fft_size / 2) used to invert
	melpoints, 1d-array of frequencies converted to mel-scale
	"""
	freq2mel = lambda f: 2595. * np.log10(1 + f / 700.)
	mel2freq = lambda m: 700. * (10**(m / 2595.) - 1)

	lowfreq = 0
	highfreq = sample_rate // 2

	lowmel = freq2mel(lowfreq)
	highmel = freq2mel(highfreq)

	melpoints = np.linspace(lowmel, highmel, 1 + nfilters + 1)

	# must convert from freq to fft bin number
	fft_bins = ((fft_size + 1) * mel2freq(melpoints) // sample_rate).astype(np.int32)

	filterbank = np.zeros((nfilters, fft_size // 2))
	for j in range(nfilters):
	for i in range(fft_bins[j], fft_bins[j + 1]):
	filterbank[j, i] = (i - fft_bins[j]) / (fft_bins[j + 1] - fft_bins[j])
	for i in range(fft_bins[j + 1], fft_bins[j + 2]):
	filterbank[j, i] = (fft_bins[j + 2] - i) / (fft_bins[j + 2] - fft_bins[j + 1])

	mel_filter = filterbank.T / filterbank.sum(axis=1).clip(1e-16)
	mel_inv_filter = filterbank

	return mel_filter, mel_inv_filter, melpoints


	def inv_spectrogram(X_s, size, step, n_iter=15):
	"""
	Feel free to disregard this code. It is not necessary that
	you follow the code below, but it can be used to invert
	from the spectrogram (signal spectrum magnitude) back to the signal
	which can be helpful when qualitatively assessing the nature of
	compression into MFCC features.
	"""

	def find_offset(a, b):
	corrs = np.convolve(a - a.mean(), b[::-1] - b.mean())
	corrs[:len(b) // 2] = -1e12
	corrs[-len(b) // 2:] = -1e12
	return corrs.argmax() - len(a)

	def iterate(X, iteration):
	T, n = X.shape
	size = n // 2

	x = np.zeros((T * step + size))
	window_sum = np.zeros((T * step + size))

	est_start = size // 2 - 1
	est_stop = est_start + size

	for t in range(T):
	x_start = t * step
	x_stop = x_start + size

	est = ifft(X[t].real + 0j if iteration == 0 else X[t]).real[::-1]
	if t > 0 and x_stop - step > x_start and est_stop - step > est_start:
	offset = find_offset(x[x_start:x_stop - step], est[est_start:est_stop - step])
	else:
	offset = 0

	x[x_start:x_stop] += est[est_start - offset:est_stop - offset] * hamming(size)
	window_sum[x_start:x_stop] += hamming(size)

	return x.real / window_sum.clip(1e-12)

	X_s = np.concatenate([X_s, X_s[:, ::-1]], axis=1)
	reg = np.max(X_s) / 1e8

	X_best = iterate(deepcopy(X_s), 0)
	for i in range(1, n_iter):
	X_best = windowing(X_best, size, step) * hamming(size)
	est = fast_fourier_transform(X_best.T).T
	phase = est / np.maximum(reg, np.abs(est))
	X_best = iterate(X_s * phase[:len(X_s)], i)

	return np.real(X_best)

	def display_audios(fname):
	signal, fs = sf.read(fname)
	size = 128 # window size for the FFT
	step = size // 2 # distance to slide along the window in time
	nfilters = 26 # number of mel frequency channels
	ncoeffs = 13 # number of cepstral coeffecients to keep

	# pre-emphasize signal
	pre_emphasized_signal = pre_emphasis(signal)

	# window signal
	frames = windowing(pre_emphasized_signal, size, step) * hamming(size)

	# compute complex spectrum
	spectrum = fast_fourier_transform(frames.T).T
	spectrum = spectrum[:, :size // 2] # only need to keep half since it's symmetric

	# compute spectrum magnitude (typically what is meant by spectrogram)
	magnitude = np.abs(spectrum)

	# get spectrum power
	power = magnitude**2 / size

	# Generate the mel filter and mel inverse filter
	mel_filter, mel_inv_filter, melpoints = mel_filterbank(nfilters, size, fs)

	# apply mel warping filters to power spectrum and take log10
	log_mel_fbank = np.log10(power.dot(mel_filter).clip(1e-16))

	# compute MFCCs using discrete cosine transform
	"""
	Note: DCT is used to decompose a finite discrete-time vector
	into a sum of scaled-and-shifted (real-valued) cosine functions
	(this can be thought of similarly to the DFT); additionally,
	the DCT often has better compression qualities as its top coefficients
	tend to by largely decorrelated, which can improve our position when
	make modeling assumptions later on
	"""
	mfccs = dct(log_mel_fbank, type=2, axis=1, norm='ortho')

	# keep subset of cepstral coefficients
	mfccs = mfccs[:,:ncoeffs]

	# invert from MFCCs back to waveform
	recovered_log_mel_fbank = idct(mfccs, type=2, n=nfilters, axis=1, norm='ortho')

	# exponentiate log and invert mel warping
	recovered_power = (10**recovered_log_mel_fbank).dot(mel_inv_filter)

	# invert mel warping of spectrogram
	recovered_magnitude = np.sqrt(recovered_power * size)
	recovered_signal = inv_spectrogram(recovered_magnitude, size, step)
	#(Note: preemphasis is not inverted in resynthesizing the speech)

	display(Audio(data=signal, rate=fs))
	display(Audio(data=pre_emphasis(signal), rate=fs))
	display(Audio(data=recovered_signal, rate=fs))