Last active
March 10, 2017 17:55
-
-
Save nobonobo/5284646 to your computer and use it in GitHub Desktop.
[Google音声認識サンプル]
依存:numpy、PyAudio==0.2.7、flac-1ツール(http://flac.sourceforge.net/download.html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# encoding: utf-8 | |
import os | |
import sys | |
import atexit | |
import json | |
import time | |
import tempfile | |
import wave | |
import traceback | |
import urllib2 | |
from subprocess import check_output | |
from Queue import Queue, Empty | |
import numpy as np | |
import pyaudio | |
class Spectrum(object):
    """Capture microphone audio and cut out segments that rise above the ambient level.

    A PyAudio input stream is opened in callback mode. For every captured
    frame a slice of the windowed spectrum is compared against the running
    average of recent "quiet" frames; a small state machine (``self.status``)
    decides when a loud segment starts and ends. Each finished segment is
    pushed to ``self.speak`` (a ``Queue``) as a 2-D float32 array with one row
    per captured frame.
    """

    FORMAT = pyaudio.paFloat32
    CHANNELS = 1
    FRAME_SIZE = 512
    RATE = 16000  # Hz

    def frames(self, n):
        """Return how many whole FRAME_SIZE-sample frames fit into ``n`` seconds."""
        return int(n * self.RATE / self.FRAME_SIZE)

    def __init__(self):
        # Finished segments are handed to the consumer through this queue.
        self.speak = Queue()
        self.pa = pyaudio.PyAudio()
        self.last_samples = None
        atexit.register(self.pa.terminate)
        # Frequency value for each FFT index; computed here originally but
        # not used in the end.
        # self.freq = np.fft.fftfreq(self.FRAME_SIZE, d=self.RATE**-1)
        # Index range taken from the fftshift-ed spectrum. Floor division
        # keeps these integers regardless of the interpreter's `/` semantics.
        # With FRAME_SIZE=512 this is bins 192..255, i.e. the 64 bins just
        # below the centre (zero-frequency) bin.
        self.begin = self.FRAME_SIZE * 3 // 8
        self.end = self.FRAME_SIZE // 2
        # Spectrum slices of the most recent ~1 s, used as the ambient estimate.
        self.fque = np.zeros((self.frames(1.0), self.end - self.begin), np.float32)
        # Raw samples of the most recent ~5 s, one row per frame, newest last.
        self.buff = np.zeros((self.frames(5.0), self.FRAME_SIZE), np.float32)

    def fft(self, samples):
        """Return the Hann-windowed, fftshift-ed magnitude spectrum of ``samples`` in dB."""
        win = np.hanning(len(samples))
        res = np.fft.fftshift(np.fft.fft(win * samples))
        return 20 * np.log10(np.abs(res))

    def callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: analyse one frame and advance the detector.

        Returns the ``(data, flag)`` tuple PyAudio expects. ``flag`` is
        ``self.recording``, so ``stop()`` can end the stream by changing it.
        """
        try:
            data = np.frombuffer(in_data, dtype=np.float32)
            # Write the new frame into row 0, then rotate so it becomes the
            # last row; the oldest frame is overwritten on the next call.
            self.buff[0] = data
            self.buff = np.roll(self.buff, -1, axis=0)
            # Once a cut-out has started (status != 0) the ambient history is
            # no longer shifted; only its first row tracks the current frame.
            if self.status == 0:
                self.fque = np.roll(self.fque, 1, axis=0)
            self.fque[0] = self.fft(data)[self.begin:self.end]
            # Average of the ambient-sound components.
            average = np.average(self.fque, axis=0)
            # Subtract the ambient average from the current spectrum slice.
            values = self.fque[0] - average
            volume = np.average(values)

            # Count frames while a candidate segment is in progress; reset
            # when idle so every segment is measured from its own onset.
            if self.status:
                self.count += 1
            else:
                self.count = 0

            if self.status < 5:
                # Onset: needs consecutive loud frames, any quiet one resets.
                if volume > 5:
                    self.status += 1
                else:
                    self.status = 0
            elif self.status == 5:
                # Sustained: wait here until the level drops.
                if volume < 5:
                    self.status += 1
            elif self.status < 15:
                # Release: quiet frames count up, loud frames count back down.
                if volume < 5:
                    self.status += 1
                else:
                    self.status -= 1
            else:
                # Segment finished: queue the frames captured since onset
                # plus two extra. Copy so the queued data is independent of
                # the rolling buffer that this callback keeps writing to.
                self.status = 0
                self.speak.put(self.buff[-self.count - 2:].copy())

            if self.debug:
                # One digit (0-9) per bin, followed by the detector state.
                pr = [min(9, max(0, int(v / 10))) for v in values]
                print('%s %s' % (''.join([str(i) for i in pr]), self.status))
            return (in_data, self.recording)
        except KeyboardInterrupt:
            self.recording = pyaudio.paAbort
            # Still return a valid tuple so the stream is told to abort.
            return (None, pyaudio.paAbort)

    def start(self, debug=False):
        """Reset the detector state and start capturing from the input device.

        :param debug: when true, print a per-frame digit dump from the callback.
        """
        self.debug = debug
        self.status = 0
        self.count = 0
        self.recording = pyaudio.paContinue
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   output=False,
                                   frames_per_buffer=self.FRAME_SIZE,
                                   stream_callback=self.callback)
        self.stream.start_stream()

    def stop(self):
        """Ask the callback to abort, wait for the stream to go idle, then close it."""
        self.recording = pyaudio.paAbort
        while self.stream.is_active():
            time.sleep(0.5)
        self.stream.stop_stream()
        self.stream.close()
# Endpoint that recognize() POSTs FLAC audio to; the language is fixed to ja-JP.
RECOGNIZE_URL = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&lang=ja-JP"
# RECOGNIZE_URL += "&maxresult=10"  # with this, the top 10 candidates are returned.
# Name (or path) of the flac command-line encoder executable.
FLAC_TOOL = 'flac'
def recognize(fpath):
    """POST the FLAC file at ``fpath`` to RECOGNIZE_URL and return the decoded JSON reply.

    :param fpath: path of a FLAC file; the request declares it as 16000 Hz audio.
    :returns: whatever ``json.loads`` produces from the response body.
    :raises: I/O errors from reading the file, ``urllib2`` errors from the
        request, and ``ValueError`` if the body is not valid JSON.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(fpath, "rb") as fp:
        flac = fp.read()
    header = {'Content-Type': 'audio/x-flac; rate=16000'}
    req = urllib2.Request(RECOGNIZE_URL, flac, header)
    data = urllib2.urlopen(req)
    try:
        return json.loads(data.read())
    finally:
        # The response object holds a socket; release it even if parsing fails.
        data.close()
def main(spe):
    """Consume detected segments from ``spe.speak`` and print what the recognizer returns.

    Loops forever. Each segment is written to a temporary 16-bit mono
    16 kHz WAV file, converted to FLAC with the external FLAC_TOOL, sent
    through ``recognize()``, and every returned hypothesis is printed as
    ``<confidence> <utterance>``.

    :param spe: object exposing a ``speak`` queue of float sample arrays
        (a started ``Spectrum`` instance in this script).
    :raises SystemExit: with status 0 when interrupted from the keyboard.
    """
    while 1:
        flac_path = None
        try:
            buff = spe.speak.get(timeout=3)
            with tempfile.NamedTemporaryFile(suffix='.wav') as fp:
                f = wave.open(fp, 'w')
                f.setnchannels(1)
                f.setsampwidth(2)
                f.setframerate(16000)
                # Clip before converting: a float sample of +1.0 scales to
                # 32768, which does not fit in int16 and would wrap around.
                pcm = np.clip(buff * 32768, -32768, 32767).astype(np.int16)
                f.writeframes(pcm.tobytes())
                f.close()
                # Make sure everything is on disk before flac reads the file.
                fp.flush()
                # flac writes its output next to the input, with a .flac suffix.
                flac_path = os.path.splitext(fp.name)[0] + '.flac'
                check_output([FLAC_TOOL, '-sf', fp.name])
                res = recognize(flac_path)
            for hyp in res.get('hypotheses', []):
                print('%s %s' % (hyp['confidence'], hyp['utterance']))
        except KeyboardInterrupt:
            raise SystemExit(0)
        except Empty:
            # Nothing was detected within the timeout; poll again.
            pass
        except Exception:
            traceback.print_exc()
            # Back off before retrying after a failure.
            time.sleep(5)
        finally:
            # The .flac file is not managed by NamedTemporaryFile, so it has
            # to be removed explicitly or one file leaks per segment.
            if flac_path is not None:
                try:
                    os.remove(flac_path)
                except OSError:
                    pass  # flac never produced the file
if __name__ == '__main__':
    # Script entry point: start capturing without the debug dump, run the
    # recognition loop, and always shut the stream down on the way out.
    spe = Spectrum()
    spe.start(debug=False)
    try:
        main(spe)
    finally:
        spe.stop()
雑音が切り出しの端っこに混入した時のブツッっていうのを緩和させていないので「えっ」「京都」とかが結構入る。
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
認識サンプルー:
0.627582 もしもーし
0.7055486 こんばんは
0.6401324 えっ
0.27831447 えっ ライト
0.39178658 豆丁
0.56470203 京都
0.27980527 京都
0.38748074 まめちゃん
0.56085163 あめちゃん 京都
0.118296765 京都 今日
0.35291997 京都 香
0.23023404 特許 mame
0.62664324 東京都
0.4853431 東京都 こうちゃん
0.41771716 東京都 紅茶
0.49495977 うー
0.31954005 登る 上る 登る
0.69323325 面白いね
0.68706167 面白いね
0.30382764 ニコでた
0.14218235 にこ
0.71369535 マメルリハ
0.7092317 オカメインコ
0.71676314 セキセイインコ
0.7160014 精飲子 ジュウシマツ
0.34830737 嶋津 うずら
0.71328956 ちよこれいと
0.52314687 レイクトローラ 甲府
0.29295838 株 東
0.27502802 デカクチバシ
0.095073506 うー
0.41421902 yeah
0.6142067 やほう
0.41653907 canon
0.5651796 波浪
0.59454215 波浪注意報
0.5844786 波浪注意報
0.5838898 今日は桜が満開です
0.6356832 もしもーし
0.54706 マミンカですか
0.50342584 東京バナナ