Skip to content

Instantly share code, notes, and snippets.

@angelovescio
Created February 18, 2022 01:54
Show Gist options
  • Save angelovescio/6888ae61039485e9622c0295c7dc3623 to your computer and use it in GitHub Desktop.
Pump streaming audio to deepspeech
# download the pyaudio from https://github.com/intxcc/pyaudio_portaudio
import deepspeech
import numpy as np
import os
import pyaudio
import time
import samplerate as sr
import nnresample
from scipy import signal,io
import audioop
import wave
recorded_frames = []       # raw audio chunks at the device's native rate
recorded_frames_down = []  # chunks after resampling to 16 kHz

def downsampleWav(src, inrate=44100, outrate=16000, inchannels=1, outchannels=1):
    """Resample a raw 16-bit PCM byte buffer from inrate to outrate.

    Appends the converted bytes to the global ``recorded_frames_down`` list
    and returns the samples as an int16 numpy array, or ``None`` on failure.

    NOTE(review): ratecv state is discarded between calls (``None`` passed
    each time), so chunk boundaries may introduce small resampling artifacts.
    """
    try:
        # audioop.ratecv returns (fragment_bytes, new_state).  The original
        # code indexed the tuple and then assigned into it (`converted[0] =`),
        # which raises TypeError -- unpack it instead.
        converted, _ = audioop.ratecv(src, 2, inchannels, inrate, outrate, None)
        # Original test was `outchannels == 1 & inchannels != 1`, which Python
        # parses as a chained comparison against (1 & inchannels); use the
        # intended boolean `and`.
        if outchannels == 1 and inchannels != 1:
            converted = audioop.tomono(converted, 2, 1, 0)
        recorded_frames_down.append(converted)
        return np.frombuffer(converted, dtype=np.int16)
    except audioop.error:
        # Best-effort: keep the stream alive even if one chunk fails.
        print('Failed to downsample wav')
        return None
# DeepSpeech parameters
DEEPSPEECH_MODEL_DIR = 'deepspeech-0.9.3-models'
MODEL_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'deepspeech-0.9.3-models.pbmm')
# Decoder tuning constants -- currently unused because the external-scorer
# call below (enableDecoderWithLM) is commented out; kept for easy re-enable.
BEAM_WIDTH = 500
LM_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'lm.binary')
TRIE_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'trie')
LM_ALPHA = 0.75
LM_BETA = 1.85
# Make DeepSpeech Model (loads the .pbmm acoustic model from disk)
model = deepspeech.Model(MODEL_FILE_PATH)
# model.enableDecoderWithLM(LM_FILE_PATH, TRIE_FILE_PATH, LM_ALPHA, LM_BETA)
# Create a Streaming session; audio chunks are fed into this context as they
# arrive and decoded incrementally.
context = model.createStream()
# Encapsulate DeepSpeech audio feeding into a callback for PyAudio
# (accumulated transcript so far, used to print only on change)
text_so_far = ''
def process_audio(in_data, frame_count, time_info, status):
    """PyAudio stream callback: resample the chunk to 16 kHz mono, feed it to
    the DeepSpeech stream, and print the intermediate transcript on change.

    Returns ``(in_data, pyaudio.paContinue)`` so the stream keeps running.
    Relies on module globals: ``context``, ``device_info``, ``channelcount``,
    ``recorded_frames``, ``text_so_far`` (all defined elsewhere in this file).
    """
    global text_so_far
    data16 = downsampleWav(in_data, int(device_info["defaultSampleRate"]), 16000, channelcount, 1)
    # downsampleWav returns None when conversion fails; skip the chunk rather
    # than crash the audio thread by feeding None into DeepSpeech.
    if data16 is not None:
        context.feedAudioContent(data16)
        text = context.intermediateDecode()
        if text != text_so_far:
            print('Interim text = {}'.format(text))
            text_so_far = text
    # Keep the raw chunk so the full capture can be written to a WAV later.
    recorded_frames.append(in_data)
    return (in_data, pyaudio.paContinue)
# Frames delivered per PyAudio callback invocation.
defaultframes = 2048

class textcolors:
    """ANSI terminal color codes; empty strings on Windows (os.name == 'nt'),
    whose classic console does not interpret ANSI escapes."""
    if os.name != 'nt':
        blue, green, warning, fail, end = (
            '\033[94m', '\033[92m', '\033[93m', '\033[91m', '\033[0m')
    else:
        blue = green = warning = fail = end = ''

# Module-level recording state (recorded_frames is reset here before capture).
recorded_frames = []
device_info = {}
useloopback = False
recordtime = 5
#Use module
p = pyaudio.PyAudio()

#Set default to first in list or ask Windows
# Resolve the default input device to a plain integer index.  The original
# code stored either the full device-info dict or -1 in the same variable,
# which crashed on the empty-input path (`default_device_index["index"]` on
# an int) and on the fallback path (passing a dict where an index is needed).
try:
    default_device_index = p.get_default_input_device_info()["index"]
except IOError:
    default_device_index = -1

#Select Device
print(textcolors.blue + "Available devices:\n" + textcolors.end)
for i in range(0, p.get_device_count()):
    info = p.get_device_info_by_index(i)
    print(textcolors.green + str(info["index"]) + textcolors.end +
          ": \t %s \n \t %s \n" % (info["name"],
          p.get_host_api_info_by_index(info["hostApi"])["name"]))
    # No system default: fall back to the first enumerated device.
    if default_device_index == -1:
        default_device_index = info["index"]

#Handle no devices available
if default_device_index == -1:
    print(textcolors.fail + "No device available. Quitting." + textcolors.end)
    exit()

#Get input or default
device_int = input("Choose device [" + textcolors.blue + str(default_device_index) + textcolors.end + "]: ")
if device_int == '':
    device_int = default_device_index
    print(default_device_index)
    print("got it")
device_id = int(device_int)
print("")

#Get device info
try:
    device_info = p.get_device_info_by_index(device_id)
except IOError:
    device_info = p.get_device_info_by_index(default_device_index)
    print(textcolors.warning + "Selection not available, using default." + textcolors.end)

#Choose between loopback or standard mode
is_input = device_info["maxInputChannels"] > 0
# WASAPI output devices can be captured via loopback in the forked pyaudio.
is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
if is_input:
    print(textcolors.blue + "Selection is input using standard mode.\n" + textcolors.end)
else:
    if is_wasapi:
        useloopback = True
        print(textcolors.green + "Selection is output. Using loopback mode.\n" + textcolors.end)
    else:
        print(textcolors.fail + "Selection is input and does not support loopback mode. Quitting.\n" + textcolors.end)
        exit()

recordtime = int(input("Record time in seconds [" + textcolors.blue + str(recordtime) + textcolors.end + "]: ") or recordtime)
# WASAPI: IAudioClient2 set properties: IsOffload = 0, Category = 0, Options = 0
# wFormatTag =WAVE_FORMAT_PCM
# nChannels =2
# nSamplesPerSec =48000
# nAvgBytesPerSec=192000
# nBlockAlign =4
# wBitsPerSample =16
# cbSize =0
# WASAPI::OpenStream(input): framesPerUser[ 512 ] framesPerHost[ 1056 ] latency[ 22.00ms ] exclusive[ NO ] wow64_fix[ NO ] mode[ POLL ]
# WASAPI: thread[ priority-0xD class-0x20 ]
#Open stream
# Use the larger of the device's input/output channel counts: when loopback-
# capturing an output device its maxInputChannels is 0, so the output count
# is the relevant one.  NOTE(review): assumes the fork exposes output devices
# this way -- confirm against intxcc/pyaudio_portaudio.
channelcount = device_info["maxInputChannels"] if (device_info["maxOutputChannels"] < device_info["maxInputChannels"]) else device_info["maxOutputChannels"]
# Open a capture stream at the device's native sample rate; each chunk of
# `defaultframes` frames is delivered to process_audio on PyAudio's callback
# thread.  `as_loopback` exists only in the forked pyaudio (see top comment).
stream = p.open(format = pyaudio.paInt16,
channels = channelcount,
rate = int(device_info["defaultSampleRate"]),
input = True,
frames_per_buffer = defaultframes,
input_device_index = device_info["index"],
stream_callback=process_audio,
as_loopback = useloopback)
# start the stream (callbacks begin firing on PyAudio's audio thread)
stream.start_stream()

#Start Recording
print(textcolors.blue + "Starting..." + textcolors.end)

def _shutdown():
    # Tear down the stream and the PyAudio instance, in that order.
    stream.stop_stream()
    stream.close()
    p.terminate()

try:
    # Poll in 0.1 s ticks, counting the requested record time down to zero.
    while stream.is_active():
        time.sleep(0.1)
        recordtime = float(recordtime) - 0.1
        if recordtime <= 0:
            _shutdown()
            break
except KeyboardInterrupt:
    # User aborted early -- clean up the same way.
    _shutdown()
print('Finished recording.')
# DeepSpeech
filename = input("Save as [" + textcolors.blue + "out.wav" + textcolors.end + "]: ") or "out.wav"
# Save the raw capture at the device's native rate and channel count.
waveFile = wave.open(filename, 'wb')
waveFile.setnchannels(channelcount)
waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
waveFile.setframerate(int(device_info["defaultSampleRate"]))
waveFile.writeframes(b''.join(recorded_frames))
waveFile.close()
# Also save the 16 kHz downsampled stream that was fed to DeepSpeech.
# NOTE(review): channelcount is written here even though downsampleWav was
# asked for mono output -- verify the header matches the actual data.
waveFile_down = wave.open(filename+".down.wav", 'wb')
waveFile_down.setnchannels(channelcount)
waveFile_down.setsampwidth(p.get_sample_size(pyaudio.paInt16))
waveFile_down.setframerate(16000)
waveFile_down.writeframes(b''.join(recorded_frames_down))
waveFile_down.close()
# Flush the streaming session and print the final transcript.
text = context.finishStream()
print('Final text = {}'.format(text))
print(textcolors.blue + "End." + textcolors.end)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment