Skip to content

Instantly share code, notes, and snippets.

@angelovescio
Created February 18, 2022 01:54
Show Gist options
  • Save angelovescio/6888ae61039485e9622c0295c7dc3623 to your computer and use it in GitHub Desktop.
Pump streaming audio to deepspeech
# download the pyaudio from https://github.com/intxcc/pyaudio_portaudio
import deepspeech
import numpy as np
import os
import pyaudio
import time
import samplerate as sr
import nnresample
from scipy import signal,io
import audioop
import wave
recorded_frames = []       # raw audio chunks at the device's native rate
recorded_frames_down = []  # chunks after resampling to 16 kHz

def downsampleWav(src, inrate=44100, outrate=16000, inchannels=1, outchannels=1):
    """Resample a raw 16-bit PCM byte buffer from inrate to outrate.

    Appends the converted bytes to the global ``recorded_frames_down`` list
    and returns the samples as an int16 numpy array, or ``None`` on failure.

    NOTE(review): ratecv state is discarded between calls (``None`` passed
    each time), so chunk boundaries may introduce small resampling artifacts.
    """
    try:
        # audioop.ratecv returns (fragment_bytes, new_state).  The original
        # code indexed the tuple and then assigned into it (`converted[0] =`),
        # which raises TypeError -- unpack it instead.
        converted, _ = audioop.ratecv(src, 2, inchannels, inrate, outrate, None)
        # Original test was `outchannels == 1 & inchannels != 1`, which Python
        # parses as a chained comparison against (1 & inchannels); use the
        # intended boolean `and`.
        if outchannels == 1 and inchannels != 1:
            converted = audioop.tomono(converted, 2, 1, 0)
        recorded_frames_down.append(converted)
        return np.frombuffer(converted, dtype=np.int16)
    except audioop.error:
        # Best-effort: keep the stream alive even if one chunk fails.
        print('Failed to downsample wav')
        return None
# DeepSpeech parameters
DEEPSPEECH_MODEL_DIR = 'deepspeech-0.9.3-models'
MODEL_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'deepspeech-0.9.3-models.pbmm')
# Decoder tuning constants -- currently unused because the external-scorer
# call below (enableDecoderWithLM) is commented out; kept for easy re-enable.
BEAM_WIDTH = 500
LM_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'lm.binary')
TRIE_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'trie')
LM_ALPHA = 0.75
LM_BETA = 1.85
# Make DeepSpeech Model (loads the .pbmm acoustic model from disk)
model = deepspeech.Model(MODEL_FILE_PATH)
# model.enableDecoderWithLM(LM_FILE_PATH, TRIE_FILE_PATH, LM_ALPHA, LM_BETA)
# Create a Streaming session; audio chunks are fed into this context as they
# arrive and decoded incrementally.
context = model.createStream()
# Encapsulate DeepSpeech audio feeding into a callback for PyAudio
# (accumulated transcript so far, used to print only on change)
text_so_far = ''
def process_audio(in_data, frame_count, time_info, status):
    """PyAudio stream callback: resample the chunk to 16 kHz mono, feed it to
    the DeepSpeech stream, and print the intermediate transcript on change.

    Returns ``(in_data, pyaudio.paContinue)`` so the stream keeps running.
    Relies on module globals: ``context``, ``device_info``, ``channelcount``,
    ``recorded_frames``, ``text_so_far`` (all defined elsewhere in this file).
    """
    global text_so_far
    data16 = downsampleWav(in_data, int(device_info["defaultSampleRate"]), 16000, channelcount, 1)
    # downsampleWav returns None when conversion fails; skip the chunk rather
    # than crash the audio thread by feeding None into DeepSpeech.
    if data16 is not None:
        context.feedAudioContent(data16)
        text = context.intermediateDecode()
        if text != text_so_far:
            print('Interim text = {}'.format(text))
            text_so_far = text
    # Keep the raw chunk so the full capture can be written to a WAV later.
    recorded_frames.append(in_data)
    return (in_data, pyaudio.paContinue)
# Frames delivered per PyAudio callback invocation.
defaultframes = 2048

class textcolors:
    """ANSI terminal color codes; empty strings on Windows (os.name == 'nt'),
    whose classic console does not interpret ANSI escapes."""
    if os.name != 'nt':
        blue, green, warning, fail, end = (
            '\033[94m', '\033[92m', '\033[93m', '\033[91m', '\033[0m')
    else:
        blue = green = warning = fail = end = ''

# Module-level recording state (recorded_frames is reset here before capture).
recorded_frames = []
device_info = {}
useloopback = False
recordtime = 5
#Use module
p = pyaudio.PyAudio()

#Set default to first in list or ask Windows
# Resolve the default input device to a plain integer index.  The original
# code stored either the full device-info dict or -1 in the same variable,
# which crashed on the empty-input path (`default_device_index["index"]` on
# an int) and on the fallback path (passing a dict where an index is needed).
try:
    default_device_index = p.get_default_input_device_info()["index"]
except IOError:
    default_device_index = -1

#Select Device
print(textcolors.blue + "Available devices:\n" + textcolors.end)
for i in range(0, p.get_device_count()):
    info = p.get_device_info_by_index(i)
    print(textcolors.green + str(info["index"]) + textcolors.end +
          ": \t %s \n \t %s \n" % (info["name"],
          p.get_host_api_info_by_index(info["hostApi"])["name"]))
    # No system default: fall back to the first enumerated device.
    if default_device_index == -1:
        default_device_index = info["index"]

#Handle no devices available
if default_device_index == -1:
    print(textcolors.fail + "No device available. Quitting." + textcolors.end)
    exit()

#Get input or default
device_int = input("Choose device [" + textcolors.blue + str(default_device_index) + textcolors.end + "]: ")
if device_int == '':
    device_int = default_device_index
    print(default_device_index)
    print("got it")
device_id = int(device_int)
print("")

#Get device info
try:
    device_info = p.get_device_info_by_index(device_id)
except IOError:
    device_info = p.get_device_info_by_index(default_device_index)
    print(textcolors.warning + "Selection not available, using default." + textcolors.end)

#Choose between loopback or standard mode
is_input = device_info["maxInputChannels"] > 0
# WASAPI output devices can be captured via loopback in the forked pyaudio.
is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
if is_input:
    print(textcolors.blue + "Selection is input using standard mode.\n" + textcolors.end)
else:
    if is_wasapi:
        useloopback = True
        print(textcolors.green + "Selection is output. Using loopback mode.\n" + textcolors.end)
    else:
        print(textcolors.fail + "Selection is input and does not support loopback mode. Quitting.\n" + textcolors.end)
        exit()

recordtime = int(input("Record time in seconds [" + textcolors.blue + str(recordtime) + textcolors.end + "]: ") or recordtime)
# WASAPI: IAudioClient2 set properties: IsOffload = 0, Category = 0, Options = 0
# wFormatTag =WAVE_FORMAT_PCM
# nChannels =2
# nSamplesPerSec =48000
# nAvgBytesPerSec=192000
# nBlockAlign =4
# wBitsPerSample =16
# cbSize =0
# WASAPI::OpenStream(input): framesPerUser[ 512 ] framesPerHost[ 1056 ] latency[ 22.00ms ] exclusive[ NO ] wow64_fix[ NO ] mode[ POLL ]
# WASAPI: thread[ priority-0xD class-0x20 ]
#Open stream
# Use the larger of the device's input/output channel counts: when loopback-
# capturing an output device its maxInputChannels is 0, so the output count
# is the relevant one.  NOTE(review): assumes the fork exposes output devices
# this way -- confirm against intxcc/pyaudio_portaudio.
channelcount = device_info["maxInputChannels"] if (device_info["maxOutputChannels"] < device_info["maxInputChannels"]) else device_info["maxOutputChannels"]
# Open a capture stream at the device's native sample rate; each chunk of
# `defaultframes` frames is delivered to process_audio on PyAudio's callback
# thread.  `as_loopback` exists only in the forked pyaudio (see top comment).
stream = p.open(format = pyaudio.paInt16,
channels = channelcount,
rate = int(device_info["defaultSampleRate"]),
input = True,
frames_per_buffer = defaultframes,
input_device_index = device_info["index"],
stream_callback=process_audio,
as_loopback = useloopback)
# start the stream (callbacks begin firing on PyAudio's audio thread)
stream.start_stream()

#Start Recording
print(textcolors.blue + "Starting..." + textcolors.end)

def _shutdown():
    # Tear down the stream and the PyAudio instance, in that order.
    stream.stop_stream()
    stream.close()
    p.terminate()

try:
    # Poll in 0.1 s ticks, counting the requested record time down to zero.
    while stream.is_active():
        time.sleep(0.1)
        recordtime = float(recordtime) - 0.1
        if recordtime <= 0:
            _shutdown()
            break
except KeyboardInterrupt:
    # User aborted early -- clean up the same way.
    _shutdown()
print('Finished recording.')
# DeepSpeech
filename = input("Save as [" + textcolors.blue + "out.wav" + textcolors.end + "]: ") or "out.wav"
# Save the raw capture at the device's native rate and channel count.
waveFile = wave.open(filename, 'wb')
waveFile.setnchannels(channelcount)
waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
waveFile.setframerate(int(device_info["defaultSampleRate"]))
waveFile.writeframes(b''.join(recorded_frames))
waveFile.close()
# Also save the 16 kHz downsampled stream that was fed to DeepSpeech.
# NOTE(review): channelcount is written here even though downsampleWav was
# asked for mono output -- verify the header matches the actual data.
waveFile_down = wave.open(filename+".down.wav", 'wb')
waveFile_down.setnchannels(channelcount)
waveFile_down.setsampwidth(p.get_sample_size(pyaudio.paInt16))
waveFile_down.setframerate(16000)
waveFile_down.writeframes(b''.join(recorded_frames_down))
waveFile_down.close()
# Flush the streaming session and print the final transcript.
text = context.finishStream()
print('Final text = {}'.format(text))
print(textcolors.blue + "End." + textcolors.end)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment