Created
June 25, 2018 14:47
-
-
Save srli/d6f2908f032c333f1e4fe022663b25cc to your computer and use it in GitHub Desktop.
Speech to text with PocketSphinx for Python3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pocketsphinx.pocketsphinx import * | |
from sphinxbase.sphinxbase import * | |
import os | |
import pyaudio | |
import wave | |
import audioop | |
from collections import deque | |
import time | |
import math | |
"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
class SpeechDetector: | |
def __init__(self, model_dir="pocketsphinx/model"):
    """Set the microphone capture parameters and build the speech decoder.

    Args:
        model_dir: Directory that holds the PocketSphinx models. The
            ``en-us`` acoustic model, language model and dictionary are
            looked up under ``<model_dir>/en-us/``. The default is the
            relative path this script originally hard-coded; pass another
            path if the pocketsphinx folder lives elsewhere.
    """
    # Microphone stream config.
    self.CHUNK = 1024  # Frames per buffer / frames requested per mic read.
    self.FORMAT = pyaudio.paInt16
    self.CHANNELS = 1
    self.RATE = 16000

    # Silence limit in seconds: the maximum amount of time during which
    # only silence is recorded. When this time passes, the recording
    # finishes and the file is decoded.
    self.SILENCE_LIMIT = 1

    # Previous audio (in seconds) to prepend. When noise is detected, this
    # much of the previously recorded audio is prepended, which helps to
    # prevent chopping the beginning of the phrase.
    self.PREV_AUDIO = 0.5

    # Intensity threshold; presumably the level above which a mic reading
    # counts as speech (the comparison itself is outside this method).
    self.THRESHOLD = 4500
    self.num_phrases = -1

    # Create a decoder configured with the US-English models.
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(model_dir, 'en-us/en-us'))
    config.set_string('-lm', os.path.join(model_dir, 'en-us/en-us.lm.bin'))
    config.set_string('-dict',
                      os.path.join(model_dir, 'en-us/cmudict-en-us.dict'))

    # Create the decoder object used for streaming data.
    self.decoder = Decoder(config)
def setup_mic(self, num_samples=50):
    """Sample the microphone and report its average audio intensity.

    Reads ``num_samples`` chunks of ``self.CHUNK`` frames from the mic,
    derives one intensity value per chunk, and prints the mean of the
    loudest 20% of those values. Run it while talking and/or while silent
    to see the intensities your microphone produces.

    Args:
        num_samples: Number of chunks to read from the microphone.
    """
    print("Getting intensity values from mic.")
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        try:
            # NOTE(review): FORMAT is paInt16 (2-byte samples) but avg() is
            # called with width 4. Left unchanged because THRESHOLD was
            # presumably tuned against values on this scale - confirm.
            values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
                      for _ in range(num_samples)]
            values.sort(reverse=True)

            # Average the loudest fifth of the samples. Use at least one
            # sample so that num_samples < 5 does not divide by zero.
            top_count = max(1, int(num_samples * 0.2))
            r = sum(values[:top_count]) / top_count

            print(" Finished ")
            print(" Average audio intensity is %s " % r)
        finally:
            # Release the input stream even if opening succeeded but a
            # read failed part-way through.
            stream.close()
    finally:
        p.terminate()
    # NOTE(review): the statement that follows this method in the source
    # is truncated ("if r ..."); whatever consumed `r` after the
    # measurement is not recoverable from this file and is not recreated.
if r self.THRESHOLD for x in slid_win]) > 0: | |
if started == False: | |
print (“Starting recording of phrase”) | |
started = True | |
audio2send.append(cur_data) | |
elif started: | |
print (“Finished recording, decoding phrase”) | |
filename = self.save_speech(list(prev_audio) + audio2send, p) | |
r = self.decode_phrase(filename) | |
print (“DETECTED: %s” % r) | |
# Removes temp audio file | |
os.remove(filename) | |
# Reset all | |
started = False | |
slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel)) | |
prev_audio = deque(maxlen=int(0.5 * rel)) | |
audio2send = [] | |
print (“Listening …”) | |
else: | |
prev_audio.append(cur_data) | |
print (“* Done listening”) | |
stream.close() | |
p.terminate() | |
# Script entry point: build a detector and start listening on the microphone.
if __name__ == "__main__":
    sd = SpeechDetector()
    sd.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This version works with Python 3 on:
Raspberry Pi - Raspbian Stretch
USB microphone as ALSA audio device 2
sudo apt-get install swig libpulse-dev
sudo pip3 install pocketsphinx