kylemcdonald/ffmpeg_load_audio.py

## ffmpeg_load_audio.py
import numpy as np
import subprocess as sp
import os
# Write-only handle on the OS null device, opened once when the module is
# imported; a sink for output that should be discarded.
DEVNULL = open(os.devnull, 'w')

# NOTE: ffmpeg_load_audio cannot detect the input sample type; pass in_type explicitly.
def ffmpeg_load_audio(filename, sr=44100, mono=False, normalize=True, in_type=np.int16, out_type=np.float32):
    """Decode an audio file into a NumPy array by piping it through ``ffmpeg``.

    ``ffmpeg`` is asked to resample the input to ``sr`` Hz, mix it to one or
    two channels and write headerless little-endian PCM to its stdout. Those
    bytes are then converted to ``out_type``.

    Requires Python 3.5+ (``subprocess.run`` / ``subprocess.DEVNULL``).

    Args:
        filename: Input handed to ``ffmpeg -i``.
        sr: Target sample rate in Hz.
        mono: Request one channel when true, two channels otherwise.
        normalize: Only used for a floating-point ``out_type``. When true the
            audio is scaled so its largest absolute sample is 1.0 (all-zero
            audio is left untouched). When false, integer ``in_type`` samples
            are divided by that integer type's maximum value instead.
        in_type: Sample type requested from ffmpeg. Anything ``np.dtype``
            accepts that resolves to float64, float32, int16, int32 or uint32.
        out_type: dtype of the returned array (anything ``np.dtype`` accepts).

    Returns:
        ``(audio, sr)``. ``audio`` has shape ``(n_samples,)`` when ``mono`` is
        true and ``(2, n_samples)`` otherwise. It is empty when ffmpeg wrote
        nothing to stdout; ffmpeg's exit status and stderr are not inspected.

    Raises:
        KeyError: ``in_type`` is not one of the supported sample types.
        OSError: the ``ffmpeg`` executable could not be started.
    """
    channels = 1 if mono else 2
    # Going through np.dtype lets callers pass a scalar class, a dtype
    # instance or a dtype string interchangeably.
    in_dtype = np.dtype(in_type)
    out_dtype = np.dtype(out_type)
    # Keyed by (kind, itemsize) so the lookup does not depend on which alias
    # or byte order the caller used to spell the type.
    format_strings = {
        ('f', 8): 'f64le',
        ('f', 4): 'f32le',
        ('i', 2): 's16le',
        ('i', 4): 's32le',
        ('u', 4): 'u32le',
    }
    format_string = format_strings[(in_dtype.kind, in_dtype.itemsize)]
    command = [
        'ffmpeg',
        '-i', filename,
        '-f', format_string,
        '-acodec', 'pcm_' + format_string,
        '-ar', str(sr),
        '-ac', str(channels),
        '-']
    # run() drains stdout to EOF and reaps the child, so no zombie process is
    # left behind and the output is not rebuilt by repeated concatenation.
    raw = sp.run(command, stdout=sp.PIPE, stderr=sp.DEVNULL).stdout
    frame_size = in_dtype.itemsize * channels
    # Drop a trailing partial frame (e.g. ffmpeg was interrupted) rather than
    # failing while parsing or reshaping.
    n_samples = (len(raw) // frame_size) * channels
    # ffmpeg was asked for little-endian samples, whatever the host byte order.
    le_dtype = in_dtype.newbyteorder('<')
    # astype() copies, so the result is writable even though frombuffer
    # returns a read-only view of the bytes.
    audio = np.frombuffer(raw, dtype=le_dtype, count=n_samples).astype(out_dtype)
    if channels > 1:
        # De-interleave: one row per channel.
        audio = audio.reshape((-1, channels)).transpose()
    if audio.size == 0:
        return audio, sr
    if np.issubdtype(out_dtype, np.floating):
        if normalize:
            peak = np.abs(audio).max()
            if peak > 0:
                audio /= peak
        elif np.issubdtype(in_dtype, np.integer):
            audio /= np.iinfo(in_dtype).max
    return audio, sr
	import numpy as np
	import subprocess as sp
	import os
	# Write-only handle on the OS null device, opened once when the module is
	# imported; a sink for output that should be discarded.
	DEVNULL = open(os.devnull, 'w')

	# NOTE: ffmpeg_load_audio cannot detect the input sample type; pass in_type explicitly.
	def ffmpeg_load_audio(filename, sr=44100, mono=False, normalize=True, in_type=np.int16, out_type=np.float32):
		"""Decode an audio file into a NumPy array by piping it through ``ffmpeg``.

		``ffmpeg`` is asked to resample the input to ``sr`` Hz, mix it to one or
		two channels and write headerless little-endian PCM to its stdout. Those
		bytes are then converted to ``out_type``.

		Requires Python 3.5+ (``subprocess.run`` / ``subprocess.DEVNULL``).

		Args:
			filename: Input handed to ``ffmpeg -i``.
			sr: Target sample rate in Hz.
			mono: Request one channel when true, two channels otherwise.
			normalize: Only used for a floating-point ``out_type``. When true the
				audio is scaled so its largest absolute sample is 1.0 (all-zero
				audio is left untouched). When false, integer ``in_type`` samples
				are divided by that integer type's maximum value instead.
			in_type: Sample type requested from ffmpeg. Anything ``np.dtype``
				accepts that resolves to float64, float32, int16, int32 or uint32.
			out_type: dtype of the returned array (anything ``np.dtype`` accepts).

		Returns:
			``(audio, sr)``. ``audio`` has shape ``(n_samples,)`` when ``mono`` is
			true and ``(2, n_samples)`` otherwise. It is empty when ffmpeg wrote
			nothing to stdout; ffmpeg's exit status and stderr are not inspected.

		Raises:
			KeyError: ``in_type`` is not one of the supported sample types.
			OSError: the ``ffmpeg`` executable could not be started.
		"""
		channels = 1 if mono else 2
		# Going through np.dtype lets callers pass a scalar class, a dtype
		# instance or a dtype string interchangeably.
		in_dtype = np.dtype(in_type)
		out_dtype = np.dtype(out_type)
		# Keyed by (kind, itemsize) so the lookup does not depend on which alias
		# or byte order the caller used to spell the type.
		format_strings = {
			('f', 8): 'f64le',
			('f', 4): 'f32le',
			('i', 2): 's16le',
			('i', 4): 's32le',
			('u', 4): 'u32le',
		}
		format_string = format_strings[(in_dtype.kind, in_dtype.itemsize)]
		command = [
			'ffmpeg',
			'-i', filename,
			'-f', format_string,
			'-acodec', 'pcm_' + format_string,
			'-ar', str(sr),
			'-ac', str(channels),
			'-']
		# run() drains stdout to EOF and reaps the child, so no zombie process is
		# left behind and the output is not rebuilt by repeated concatenation.
		raw = sp.run(command, stdout=sp.PIPE, stderr=sp.DEVNULL).stdout
		frame_size = in_dtype.itemsize * channels
		# Drop a trailing partial frame (e.g. ffmpeg was interrupted) rather than
		# failing while parsing or reshaping.
		n_samples = (len(raw) // frame_size) * channels
		# ffmpeg was asked for little-endian samples, whatever the host byte order.
		le_dtype = in_dtype.newbyteorder('<')
		# astype() copies, so the result is writable even though frombuffer
		# returns a read-only view of the bytes.
		audio = np.frombuffer(raw, dtype=le_dtype, count=n_samples).astype(out_dtype)
		if channels > 1:
			# De-interleave: one row per channel.
			audio = audio.reshape((-1, channels)).transpose()
		if audio.size == 0:
			return audio, sr
		if np.issubdtype(out_dtype, np.floating):
			if normalize:
				peak = np.abs(audio).max()
				if peak > 0:
					audio /= peak
			elif np.issubdtype(in_dtype, np.integer):
				audio /= np.iinfo(in_dtype).max
		return audio, sr