# MechCoder/audio_utils.py

## audio_utils.py
import os

import numpy as np
import scipy.io.wavfile as wav


def convert_mp3_to_wav(folder, sampling_freq=44.1):
    """
    Converts every mp3 file in a directory to a mono wav file.

    For each ``<name>.mp3`` found in ``folder`` two files are written next
    to it: ``<name>mono.mp3`` (the mono downmix) and ``<name>.wav`` (the
    decoded downmix). The work is delegated to the external ``lame``
    program, whose exit status is not checked (best effort).

    Arguments
    ---------
    folder - A folder containing the .mp3 files to convert.

    sampling_freq - Value handed verbatim to lame's ``--resample`` option.
        The default of 44.1 follows lame's kHz convention for that option.

    Raises
    ------
    OSError - if the ``lame`` executable cannot be started.
    """
    # Imported locally so the function does not depend on the module header.
    import subprocess

    suffix = ".mp3"
    for filename in os.listdir(folder):
        if not filename.endswith(suffix):
            continue

        from_path = os.path.join(folder, filename)
        stem = from_path[:-len(suffix)]
        mono_path = stem + "mono.mp3"
        # from_path already contains the folder, so the output path is
        # derived from it directly; joining the folder a second time would
        # produce "folder/folder/name.wav" for relative folders.
        wav_path = stem + ".wav"

        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters intact.

        # Converts stereo to mono
        subprocess.call(["lame", "-a", "-m", "m", from_path, mono_path])

        # Converts mono to wav with the given sampling frequency.
        subprocess.call(
            ["lame", "--decode", mono_path, wav_path,
             "--resample", str(sampling_freq)])


def time_blocks_to_fft_blocks(time_blocks):
    """
    Maps each time-domain block to its frequency-domain representation.

    Every block is run through the FFT and stored as a single real-valued
    array: the real parts of the coefficients followed by the imaginary
    parts. The per-block results are stacked into one array.
    """
    def _real_then_imag(samples):
        # FFT of one block, flattened into [real parts..., imag parts...].
        spectrum = np.fft.fft(samples)
        return np.concatenate((spectrum.real, spectrum.imag))

    return np.array([_real_then_imag(samples) for samples in time_blocks])


def convert_wav_to_freq_domain(wavfile, block_size=None):
    """
    Converts a wav file to the frequency domain so it can be input to a RNN.

    The samples are scaled as 16-bit PCM, cut into consecutive blocks of
    block_size samples (the last block is zero-padded) and every block is
    replaced by the real parts of its FFT followed by the imaginary parts.

    Arguments
    ---------
    wavfile - A file with extension .wav
        Multi-channel audio is averaged down to a single channel.

    block_size - Number of samples per block.
        If not provided, set to frequency // 4.

    Returns
    -------
    x_freq - shape=(num_blocks, 2*block_size)
        Row i holds the real, then the imaginary FFT coefficients of
        block i.

    y_freq - shape=(num_blocks, 2*block_size)
        x_freq shifted by 1 block: row i equals x_freq[i + 1] and the
        last row is all zeros.

    Raises
    ------
    ValueError - if block_size is smaller than 1.
    """
    freq, amplitude = wav.read(wavfile)
    amplitude = np.asarray(amplitude, dtype=np.float32)

    # Multi-channel data arrives as (num_samples, num_channels); the
    # blocking below needs a 1-D signal, so average the channels.
    if amplitude.ndim > 1:
        amplitude = amplitude.mean(axis=1)

    # Normalize to [-1, 1]; 16-bit PCM samples span [-32768, 32767].
    amplitude = amplitude / 32768.0

    if block_size is None:
        block_size = freq // 4
    if block_size < 1:
        raise ValueError(
            "block_size must be at least 1, got %r" % (block_size,))

    # Zero-pad up to a whole number of blocks. An empty signal still
    # yields one all-zero block.
    num_samples = amplitude.shape[0]
    num_blocks = max(1, -(-num_samples // block_size))
    padded = np.zeros(num_blocks * block_size)
    padded[:num_samples] = amplitude
    blocks = padded.reshape(num_blocks, block_size)

    # One FFT call over all blocks; real parts first, imaginary parts after.
    spectrum = np.fft.fft(blocks, axis=1)
    x_freq = np.concatenate((spectrum.real, spectrum.imag), axis=1)

    # The target is the input advanced by one block, so reuse x_freq instead
    # of transforming the same samples a second time.
    y_freq = np.zeros_like(x_freq)
    y_freq[:-1] = x_freq[1:]
    return x_freq, y_freq


def generate_tensors(directory, max_seq_len=40, block_size=None):
    """
    Generates training tensors from every .wav file in a directory.

    Each file is converted with convert_wav_to_freq_domain and cut into
    consecutive sequences of max_seq_len blocks (time steps). A trailing
    sequence shorter than max_seq_len is dropped.

    Arguments
    ---------
    directory - A directory that has .wav files in it

    max_seq_len - Number of blocks per example. If you want a clip of
        x seconds set this to (x * sample_frequency / block_size).

    block_size - Forwarded to convert_wav_to_freq_domain; None lets that
        function pick its default for each file.

    Returns
    -------
    X, y - arrays of shape (num_examples, max_seq_len, 2*block_size)
        y holds the same sequences as X advanced by one block. Both have
        shape (0,) when no example could be built.

    Raises
    ------
    ValueError - if max_seq_len is smaller than 1.
    """
    if max_seq_len < 1:
        raise ValueError(
            "max_seq_len must be at least 1, got %r" % (max_seq_len,))

    X_chunks = []
    y_chunks = []
    # Sorted so the order of the examples does not depend on the order in
    # which the file system happens to list the directory.
    for wav_file in sorted(os.listdir(directory)):
        if not wav_file.endswith(".wav"):
            continue

        x_freq, y_freq = convert_wav_to_freq_domain(
            os.path.join(directory, wav_file), block_size=block_size)

        # Keep every complete sequence, including the last one when the
        # number of blocks is an exact multiple of max_seq_len; only a
        # short remainder is discarded.
        usable = (x_freq.shape[0] // max_seq_len) * max_seq_len
        for start in range(0, usable, max_seq_len):
            stop = start + max_seq_len
            X_chunks.append(x_freq[start:stop])
            y_chunks.append(y_freq[start:stop])

    return np.asarray(X_chunks), np.asarray(y_chunks)
	import os

	import numpy as np
	import scipy.io.wavfile as wav


	def convert_mp3_to_wav(folder, sampling_freq=44.1):
	    """
	    Converts every mp3 file in a directory to a mono wav file.

	    For each ``<name>.mp3`` found in ``folder`` two files are written next
	    to it: ``<name>mono.mp3`` (the mono downmix) and ``<name>.wav`` (the
	    decoded downmix). The work is delegated to the external ``lame``
	    program, whose exit status is not checked (best effort).

	    Arguments
	    ---------
	    folder - A folder containing the .mp3 files to convert.

	    sampling_freq - Value handed verbatim to lame's ``--resample`` option.
	        The default of 44.1 follows lame's kHz convention for that option.

	    Raises
	    ------
	    OSError - if the ``lame`` executable cannot be started.
	    """
	    # Imported locally so the function does not depend on the module header.
	    import subprocess

	    suffix = ".mp3"
	    for filename in os.listdir(folder):
	        if not filename.endswith(suffix):
	            continue

	        from_path = os.path.join(folder, filename)
	        stem = from_path[:-len(suffix)]
	        mono_path = stem + "mono.mp3"
	        # from_path already contains the folder, so the output path is
	        # derived from it directly; joining the folder a second time would
	        # produce "folder/folder/name.wav" for relative folders.
	        wav_path = stem + ".wav"

	        # Argument lists (no shell) keep paths with spaces or shell
	        # metacharacters intact.

	        # Converts stereo to mono
	        subprocess.call(["lame", "-a", "-m", "m", from_path, mono_path])

	        # Converts mono to wav with the given sampling frequency.
	        subprocess.call(
	            ["lame", "--decode", mono_path, wav_path,
	             "--resample", str(sampling_freq)])


	def time_blocks_to_fft_blocks(time_blocks):
	    """
	    Converts arrays from the time domain to the frequency domain.

	    Every block is run through the FFT and stored as a single real-valued
	    array: the real parts of the coefficients followed by the imaginary
	    parts. The per-block results are stacked into one array.
	    """
	    fft_blocks = []
	    for block in time_blocks:
	        fft_block = np.fft.fft(block)
	        # Real parts first, imaginary parts after.
	        new_block = np.concatenate((np.real(fft_block), np.imag(fft_block)))
	        fft_blocks.append(new_block)
	    return np.array(fft_blocks)


	def convert_wav_to_freq_domain(wavfile, block_size=None):
	    """
	    Converts a wav file to the frequency domain so it can be input to a RNN.

	    The samples are scaled as 16-bit PCM, cut into consecutive blocks of
	    block_size samples (the last block is zero-padded) and every block is
	    replaced by the real parts of its FFT followed by the imaginary parts.

	    Arguments
	    ---------
	    wavfile - A file with extension .wav
	        Multi-channel audio is averaged down to a single channel.

	    block_size - Number of samples per block.
	        If not provided, set to frequency // 4.

	    Returns
	    -------
	    x_freq - shape=(num_blocks, 2*block_size)
	        Row i holds the real, then the imaginary FFT coefficients of
	        block i.

	    y_freq - shape=(num_blocks, 2*block_size)
	        x_freq shifted by 1 block: row i equals x_freq[i + 1] and the
	        last row is all zeros.

	    Raises
	    ------
	    ValueError - if block_size is smaller than 1.
	    """
	    freq, amplitude = wav.read(wavfile)
	    amplitude = np.asarray(amplitude, dtype=np.float32)

	    # Multi-channel data arrives as (num_samples, num_channels); the
	    # blocking below needs a 1-D signal, so average the channels.
	    if amplitude.ndim > 1:
	        amplitude = amplitude.mean(axis=1)

	    # Normalize to [-1, 1]; 16-bit PCM samples span [-32768, 32767].
	    amplitude = amplitude / 32768.0

	    if block_size is None:
	        block_size = freq // 4
	    if block_size < 1:
	        raise ValueError(
	            "block_size must be at least 1, got %r" % (block_size,))

	    # Zero-pad up to a whole number of blocks. An empty signal still
	    # yields one all-zero block.
	    num_samples = amplitude.shape[0]
	    num_blocks = max(1, -(-num_samples // block_size))
	    padded = np.zeros(num_blocks * block_size)
	    padded[:num_samples] = amplitude
	    blocks = padded.reshape(num_blocks, block_size)

	    # One FFT call over all blocks; real parts first, imaginary parts after.
	    spectrum = np.fft.fft(blocks, axis=1)
	    x_freq = np.concatenate((spectrum.real, spectrum.imag), axis=1)

	    # The target is the input advanced by one block, so reuse x_freq instead
	    # of transforming the same samples a second time.
	    y_freq = np.zeros_like(x_freq)
	    y_freq[:-1] = x_freq[1:]
	    return x_freq, y_freq


	def generate_tensors(directory, max_seq_len=40, block_size=None):
	    """
	    Generates training tensors from every .wav file in a directory.

	    Each file is converted with convert_wav_to_freq_domain and cut into
	    consecutive sequences of max_seq_len blocks (time steps). A trailing
	    sequence shorter than max_seq_len is dropped.

	    Arguments
	    ---------
	    directory - A directory that has .wav files in it

	    max_seq_len - Number of blocks per example. If you want a clip of
	        x seconds set this to (x * sample_frequency / block_size).

	    block_size - Forwarded to convert_wav_to_freq_domain; None lets that
	        function pick its default for each file.

	    Returns
	    -------
	    X, y - arrays of shape (num_examples, max_seq_len, 2*block_size)
	        y holds the same sequences as X advanced by one block. Both have
	        shape (0,) when no example could be built.

	    Raises
	    ------
	    ValueError - if max_seq_len is smaller than 1.
	    """
	    if max_seq_len < 1:
	        raise ValueError(
	            "max_seq_len must be at least 1, got %r" % (max_seq_len,))

	    X_chunks = []
	    y_chunks = []
	    # Sorted so the order of the examples does not depend on the order in
	    # which the file system happens to list the directory.
	    for wav_file in sorted(os.listdir(directory)):
	        if not wav_file.endswith(".wav"):
	            continue

	        x_freq, y_freq = convert_wav_to_freq_domain(
	            os.path.join(directory, wav_file), block_size=block_size)

	        # Keep every complete sequence, including the last one when the
	        # number of blocks is an exact multiple of max_seq_len; only a
	        # short remainder is discarded.
	        usable = (x_freq.shape[0] // max_seq_len) * max_seq_len
	        for start in range(0, usable, max_seq_len):
	            stop = start + max_seq_len
	            X_chunks.append(x_freq[start:stop])
	            y_chunks.append(y_freq[start:stop])

	    return np.asarray(X_chunks), np.asarray(y_chunks)