Skip to content

Instantly share code, notes, and snippets.

@palonso
Last active November 14, 2019 17:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save palonso/dfcc8ab57b1d9ecf14bcfe14f0ef322d to your computer and use it in GitHub Desktop.
Save palonso/dfcc8ab57b1d9ecf14bcfe14f0ef322d to your computer and use it in GitHub Desktop.
Forward and Backward NSGCQ transforms. Complex values are encoded in 4 channes: real part magnitude, imag part magnitude, real part sign and imag part sign. This representation seems suitable for feeding NNs while allowing perfect reconstruction.
import numpy as np
import matplotlib.pyplot as plt
import essentia.pytools.spectral as sp
from essentia.standard import MonoLoader, MonoWriter
from essentia import lin2db
PARAMS = {
'frameSize': 2 ** 14,
'sampleRate': 44100,
'minFrequency': 40,
'maxFrequency': 15000,
'binsPerOctave': 24,
'normalize': 'sine',
'transitionSize': 256
}
def SNR(r, t, skip=8192):
"""
r : reference
t : test
skip : number of samples to skip from the SNR computation
"""
difference = ((r[skip: -skip] - t[skip: -skip]) ** 2).sum()
return lin2db((r[skip: -skip] ** 2).sum() / difference)
def cqt(audio):
"""Forward transform. Takes audio at the sample rate specified in PARAMS.
Returns a 4D vector (n_frames, channels, frame_size, n_bins)
- n_frames depends on the audio length.
- there are 4 channels: real part, imag part, real part sign, imag part sign.
- frame_size: duration of each frame in samples. e.g., 2 ** 14 / 44100 = 0.38s
- n_bins: number of frequency bins. Can be controled with binsPerOctave, min
and max frequencies.
"""
cq_frames, dc_frames, nb_frames = sp.nsgcqgram(x, **PARAMS)
n_frames = len(cq_frames)
stack = np.empty([n_frames, 4, cq_frames[0].shape[0], cq_frames[0].shape[1]])
for f in range(n_frames):
stack[f, 0] = np.log10(10000. * abs(cq_frames[f].real) + 1.) # ch.0
stack[f, 1] = np.log10(10000. * abs(cq_frames[f].imag) + 1.) # ch.1
stack[f, 2] = np.array(cq_frames[f].real < 0, dtype=int) # ch.2
stack[f, 3] = np.array(cq_frames[f].imag < 0, dtype=int) # ch.3
return stack, dc_frames, nb_frames
def icqt(cq_frames, dc_frames=None, nb_frames=None):
""" Backward transform. Takes cqt frames and returns audio.
Optionally DC and Nyquist bands that are outside the analysis range
can be added.
"""
stack = []
nbins = cq_frames.shape[1] // 2
stack = []
for f in range(cq_frames.shape[0]):
iscqt = cq_frames[f]
iscqt_real = (10 ** iscqt[0] - 1.) / 10000.
iscqt_real[np.where(iscqt[2])] *= -1
iscqt_imag = (10 ** iscqt[1] - 1.) / 10000.
iscqt_imag[np.where(iscqt[3])] *= -1
stack.append(np.array(iscqt_real + 1j * iscqt_imag, dtype='complex64'))
if not dc_frames:
dc_frames = [np.zeros(30).astype('complex64')] * cq_frames.shape[0]
if not nb_frames:
nb_frames = [np.zeros(5308).astype('complex64')] * cq_frames.shape[0]
y_frames = sp.nsgicqgram(stack, dc_frames, nb_frames, **PARAMS)
return y_frames
if __name__ == "__main__":
x = MonoLoader(filename='/home/pablo/vignesh.wav')()
cq_frames, dc_frames, nb_frames = cqt(x)
y_frames = icqt(cq_frames, dc_frames, nb_frames)
cq_snr = SNR(x, y_frames[:x.size])
print('Reconstruction w out of range data SNR: {:.3f} dB'.format(cq_snr))
y_frames = icqt(cq_frames)
cq_snr = SNR(x, y_frames[:x.size])
print('Reconstruction w/o out of range data SNR: {:.3f} dB'.format(cq_snr))
plt.matshow(cq_frames[5, 1]), plt.colorbar(), plt.show()
plt.matshow(cq_frames[5, 3]), plt.colorbar(), plt.show()
# Write reconstructed example
# MonoWriter(filename='reconstructed.wav')(y_frames)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment