Forward and Backward NSGCQ transforms. Complex values are encoded in 4 channes: real part magnitude, imag part magnitude, real part sign and imag part sign. This representation seems suitable for feeding NNs while allowing perfect reconstruction.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
import essentia.pytools.spectral as sp | |
from essentia.standard import MonoLoader, MonoWriter | |
from essentia import lin2db | |
PARAMS = { | |
'frameSize': 2 ** 14, | |
'sampleRate': 44100, | |
'minFrequency': 40, | |
'maxFrequency': 15000, | |
'binsPerOctave': 24, | |
'normalize': 'sine', | |
'transitionSize': 256 | |
} | |
def SNR(r, t, skip=8192): | |
""" | |
r : reference | |
t : test | |
skip : number of samples to skip from the SNR computation | |
""" | |
difference = ((r[skip: -skip] - t[skip: -skip]) ** 2).sum() | |
return lin2db((r[skip: -skip] ** 2).sum() / difference) | |
def cqt(audio): | |
"""Forward transform. Takes audio at the sample rate specified in PARAMS. | |
Returns a 4D vector (n_frames, channels, frame_size, n_bins) | |
- n_frames depends on the audio length. | |
- there are 4 channels: real part, imag part, real part sign, imag part sign. | |
- frame_size: duration of each frame in samples. e.g., 2 ** 14 / 44100 = 0.38s | |
- n_bins: number of frequency bins. Can be controled with binsPerOctave, min | |
and max frequencies. | |
""" | |
cq_frames, dc_frames, nb_frames = sp.nsgcqgram(x, **PARAMS) | |
n_frames = len(cq_frames) | |
stack = np.empty([n_frames, 4, cq_frames[0].shape[0], cq_frames[0].shape[1]]) | |
for f in range(n_frames): | |
stack[f, 0] = np.log10(10000. * abs(cq_frames[f].real) + 1.) # ch.0 | |
stack[f, 1] = np.log10(10000. * abs(cq_frames[f].imag) + 1.) # ch.1 | |
stack[f, 2] = np.array(cq_frames[f].real < 0, dtype=int) # ch.2 | |
stack[f, 3] = np.array(cq_frames[f].imag < 0, dtype=int) # ch.3 | |
return stack, dc_frames, nb_frames | |
def icqt(cq_frames, dc_frames=None, nb_frames=None): | |
""" Backward transform. Takes cqt frames and returns audio. | |
Optionally DC and Nyquist bands that are outside the analysis range | |
can be added. | |
""" | |
stack = [] | |
nbins = cq_frames.shape[1] // 2 | |
stack = [] | |
for f in range(cq_frames.shape[0]): | |
iscqt = cq_frames[f] | |
iscqt_real = (10 ** iscqt[0] - 1.) / 10000. | |
iscqt_real[np.where(iscqt[2])] *= -1 | |
iscqt_imag = (10 ** iscqt[1] - 1.) / 10000. | |
iscqt_imag[np.where(iscqt[3])] *= -1 | |
stack.append(np.array(iscqt_real + 1j * iscqt_imag, dtype='complex64')) | |
if not dc_frames: | |
dc_frames = [np.zeros(30).astype('complex64')] * cq_frames.shape[0] | |
if not nb_frames: | |
nb_frames = [np.zeros(5308).astype('complex64')] * cq_frames.shape[0] | |
y_frames = sp.nsgicqgram(stack, dc_frames, nb_frames, **PARAMS) | |
return y_frames | |
if __name__ == "__main__": | |
x = MonoLoader(filename='/home/pablo/vignesh.wav')() | |
cq_frames, dc_frames, nb_frames = cqt(x) | |
y_frames = icqt(cq_frames, dc_frames, nb_frames) | |
cq_snr = SNR(x, y_frames[:x.size]) | |
print('Reconstruction w out of range data SNR: {:.3f} dB'.format(cq_snr)) | |
y_frames = icqt(cq_frames) | |
cq_snr = SNR(x, y_frames[:x.size]) | |
print('Reconstruction w/o out of range data SNR: {:.3f} dB'.format(cq_snr)) | |
plt.matshow(cq_frames[5, 1]), plt.colorbar(), plt.show() | |
plt.matshow(cq_frames[5, 3]), plt.colorbar(), plt.show() | |
# Write reconstructed example | |
# MonoWriter(filename='reconstructed.wav')(y_frames) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment