Created
February 18, 2022 01:54
-
-
Save angelovescio/6888ae61039485e9622c0295c7dc3623 to your computer and use it in GitHub Desktop.
Pump streaming audio to deepspeech
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# download the pyaudio from https://github.com/intxcc/pyaudio_portaudio | |
import deepspeech | |
import numpy as np | |
import os | |
import pyaudio | |
import time | |
import samplerate as sr | |
import nnresample | |
from scipy import signal,io | |
import audioop | |
import wave | |
# Raw chunks at the device's native rate (written to out.wav at the end).
recorded_frames = []
# The same audio after conversion to 16 kHz (written to out.wav.down.wav).
recorded_frames_down = []
def downsampleWav(src, inrate=44100, outrate=16000, inchannels=1, outchannels=1):
    """Convert a chunk of 16-bit PCM audio from inrate to outrate.

    Parameters:
        src (bytes): raw 16-bit little-endian PCM frames.
        inrate, outrate (int): input and output sample rates in Hz.
        inchannels, outchannels (int): channel counts before/after conversion.

    Returns:
        numpy.ndarray of int16 samples, or None if conversion failed.

    Side effect: appends the converted raw bytes to recorded_frames_down.
    """
    try:
        # ratecv returns (fragment, state); the state is discarded here, which
        # can glitch at chunk boundaries when streaming -- kept as-is since the
        # original also passed None each call.
        converted, _state = audioop.ratecv(src, 2, inchannels, inrate, outrate, None)
        # Fixed: original wrote `outchannels == 1 & inchannels != 1`, which
        # Python parses as `outchannels == (1 & inchannels) != 1`, and then
        # assigned into the immutable tuple returned by ratecv (TypeError).
        if outchannels == 1 and inchannels != 1:
            # lfactor=1, rfactor=0: keep the left channel only.
            converted = audioop.tomono(converted, 2, 1, 0)
        recorded_frames_down.append(converted)
        return np.frombuffer(converted, dtype=np.int16)
    except audioop.error:
        # Best-effort: callers treat None as "no usable audio for this chunk".
        print('Failed to downsample wav')
        return None
# DeepSpeech parameters
DEEPSPEECH_MODEL_DIR = 'deepspeech-0.9.3-models'
MODEL_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'deepspeech-0.9.3-models.pbmm')
# Decoder tuning constants; only used if the external-scorer call below is
# re-enabled. NOTE(review): enableDecoderWithLM/lm.binary/trie belong to the
# pre-0.6 DeepSpeech API -- confirm against the 0.9.3 API (enableExternalScorer)
# before uncommenting.
BEAM_WIDTH = 500
LM_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'lm.binary')
TRIE_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'trie')
LM_ALPHA = 0.75
LM_BETA = 1.85
# Make DeepSpeech Model
model = deepspeech.Model(MODEL_FILE_PATH)
# model.enableDecoderWithLM(LM_FILE_PATH, TRIE_FILE_PATH, LM_ALPHA, LM_BETA)
# Create a Streaming session
context = model.createStream()
# Encapsulate DeepSpeech audio feeding into a callback for PyAudio
# Last interim transcription printed, so repeats are suppressed.
text_so_far = ''
def process_audio(in_data, frame_count, time_info, status):
    """PyAudio stream callback: resample each captured chunk to 16 kHz,
    feed it to the DeepSpeech streaming session, and print interim text.

    Parameters follow the PyAudio callback signature (raw bytes, frame count,
    timing dict, status flags). Returns (in_data, paContinue) so the stream
    keeps running. Also appends the raw chunk to recorded_frames for the
    full-rate WAV written at the end.
    """
    global text_so_far
    # Capture rate and channel count come from the device selected below.
    data16 = downsampleWav(in_data, int(device_info["defaultSampleRate"]), 16000, channelcount, 1)
    # Fixed: downsampleWav returns None on conversion failure; feeding None
    # to DeepSpeech would raise inside the audio callback thread.
    if data16 is not None:
        context.feedAudioContent(data16)
        text = context.intermediateDecode()
        if text != text_so_far:
            print('Interim text = {}'.format(text))
            text_so_far = text
    recorded_frames.append(in_data)
    return (in_data, pyaudio.paContinue)
# PyAudio buffer size: frames delivered per process_audio callback.
defaultframes = 2048
class textcolors:
    """ANSI escape codes for terminal colouring.

    All codes are empty strings on Windows (os.name == 'nt'), where the
    classic console does not interpret ANSI sequences.
    """
    _ansi = os.name != 'nt'  # single switch instead of duplicated branches
    blue = '\033[94m' if _ansi else ''
    green = '\033[92m' if _ansi else ''
    warning = '\033[93m' if _ansi else ''
    fail = '\033[91m' if _ansi else ''
    end = '\033[0m' if _ansi else ''
# NOTE(review): recorded_frames was already initialised at the top of the
# file; this re-binding clears it again before recording starts.
recorded_frames = []
device_info = {}
useloopback = False
# Default recording duration in seconds (user can override below).
recordtime = 5
#Use module
p = pyaudio.PyAudio()
#Set default to first in list or ask Windows
# NOTE(review): on success this holds a device-info dict; on IOError the
# int -1. Later code indexes it with ["index"], which only works for the
# dict case -- see the selection block below.
try:
    default_device_index = p.get_default_input_device_info()
except IOError:
    default_device_index = -1
#Select Device
print (textcolors.blue + "Available devices:\n" + textcolors.end)
for i in range(0, p.get_device_count()):
    info = p.get_device_info_by_index(i)
    print (textcolors.green + str(info["index"]) + textcolors.end + ": \t %s \n \t %s \n" % (info["name"], p.get_host_api_info_by_index(info["hostApi"])["name"]))
    if default_device_index == -1:
        # Fall back to the first enumerated device when no default exists.
        # NOTE(review): this stores the bare int index, not the info dict.
        default_device_index = info["index"]
#Handle no devices available
if default_device_index == -1:
    print (textcolors.fail + "No device available. Quitting." + textcolors.end)
    exit()
# Ask which device to record from, defaulting to the one detected above.
# Fixed: default_device_index is a device-info dict when the default input
# device was found, but a bare int when the IOError / first-device fallback
# ran; the original unconditionally did default_device_index["index"], which
# raises TypeError for the int case. It also printed the whole dict repr in
# the prompt instead of the index.
if isinstance(default_device_index, dict):
    default_id = int(default_device_index["index"])
else:
    default_id = int(default_device_index)
device_int = input("Choose device [" + textcolors.blue + str(default_id) + textcolors.end + "]: ")
#Get input or default
if device_int == '':
    device_int = default_id
    print(default_device_index)
    print("got it")
device_id = int(device_int)
print ("")
#Get device info
try:
    device_info = p.get_device_info_by_index(device_id)
except IOError:
    # NOTE(review): default_device_index may be a dict here (see selection
    # above), in which case this call raises -- confirm before relying on it.
    device_info = p.get_device_info_by_index(default_device_index)
    print (textcolors.warning + "Selection not available, using default." + textcolors.end)
#Choose between loopback or standard mode
is_input = device_info["maxInputChannels"] > 0
# Loopback capture of an output device requires the WASAPI host API.
is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
if is_input:
    print (textcolors.blue + "Selection is input using standard mode.\n" + textcolors.end)
else:
    if is_wasapi:
        useloopback = True;
        print (textcolors.green + "Selection is output. Using loopback mode.\n" + textcolors.end)
    else:
        print (textcolors.fail + "Selection is input and does not support loopback mode. Quitting.\n" + textcolors.end)
        exit()
# Let the user override the record time; empty input keeps the default.
recordtime = int(input("Record time in seconds [" + textcolors.blue + str(recordtime) + textcolors.end + "]: ") or recordtime)
# Sample WASAPI trace from the pyaudio_portaudio fork, kept for reference:
# WASAPI: IAudioClient2 set properties: IsOffload = 0, Category = 0, Options = 0
# wFormatTag =WAVE_FORMAT_PCM
# nChannels =2
# nSamplesPerSec =48000
# nAvgBytesPerSec=192000
# nBlockAlign =4
# wBitsPerSample =16
# cbSize =0
# WASAPI::OpenStream(input): framesPerUser[ 512 ] framesPerHost[ 1056 ] latency[ 22.00ms ] exclusive[ NO ] wow64_fix[ NO ] mode[ POLL ]
# WASAPI: thread[ priority-0xD class-0x20 ]
#Open stream
# Use the larger of the input/output channel counts: loopback devices report
# their channels on the output side.
channelcount = device_info["maxInputChannels"] if (device_info["maxOutputChannels"] < device_info["maxInputChannels"]) else device_info["maxOutputChannels"]
stream = p.open(format = pyaudio.paInt16,
                channels = channelcount,
                rate = int(device_info["defaultSampleRate"]),
                input = True,
                frames_per_buffer = defaultframes,
                input_device_index = device_info["index"],
                stream_callback=process_audio,
                # as_loopback is specific to the intxcc/pyaudio_portaudio fork
                # linked at the top of the file; stock PyAudio rejects it.
                as_loopback = useloopback)
# start the stream
stream.start_stream()
#Start Recording
print (textcolors.blue + "Starting..." + textcolors.end)
try:
    # Poll until the requested duration elapses; audio arrives via the
    # process_audio callback on PyAudio's internal thread.
    while stream.is_active():
        time.sleep(0.1)
        recordtime = float(recordtime) - 0.1
        if recordtime <= 0:
            stream.stop_stream()
            stream.close()
            p.terminate()
            break
except KeyboardInterrupt:
    # PyAudio
    stream.stop_stream()
    stream.close()
    p.terminate()
print('Finished recording.')
# DeepSpeech
filename = input("Save as [" + textcolors.blue + "out.wav" + textcolors.end + "]: ") or "out.wav"
# Write the raw capture at the device's native rate and channel count.
waveFile = wave.open(filename, 'wb')
waveFile.setnchannels(channelcount)
waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
waveFile.setframerate(int(device_info["defaultSampleRate"]))
waveFile.writeframes(b''.join(recorded_frames))
waveFile.close()
# Also write the 16 kHz stream that was fed to DeepSpeech.
# NOTE(review): channelcount here presumably should be 1 for the downsampled
# file -- downsampleWav mixes to mono only when inchannels != 1; verify.
waveFile_down = wave.open(filename+".down.wav", 'wb')
waveFile_down.setnchannels(channelcount)
waveFile_down.setsampwidth(p.get_sample_size(pyaudio.paInt16))
waveFile_down.setframerate(16000)
waveFile_down.writeframes(b''.join(recorded_frames_down))
waveFile_down.close()
# Close out the streaming session and print the final transcription.
text = context.finishStream()
print('Final text = {}'.format(text))
print (textcolors.blue + "End." + textcolors.end)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment