Skip to content

Instantly share code, notes, and snippets.

@nobonobo
Last active March 10, 2017 17:55
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save nobonobo/5284646 to your computer and use it in GitHub Desktop.
Save nobonobo/5284646 to your computer and use it in GitHub Desktop.
[Google音声認識サンプル] 依存:numpy、PyAudio==0.2.7、flac-1ツール(http://flac.sourceforge.net/download.html)
#!/usr/bin/env python
# encoding: utf-8
import os
import sys
import atexit
import json
import time
import tempfile
import wave
import traceback
import urllib2
from subprocess import check_output
from Queue import Queue, Empty
import numpy as np
import pyaudio
class Spectrum(object):
    """Microphone listener that cuts utterances out of a live audio stream.

    Audio is captured through a PyAudio callback stream.  For every frame a
    band of FFT bins is compared against the average of the most recent
    frames (the ambient level).  A small state machine driven by that
    difference decides when an utterance starts and ends; each finished
    utterance is pushed onto ``self.speak`` as a 2-D float32 array of
    frames (rows of ``FRAME_SIZE`` samples).
    """

    FORMAT = pyaudio.paFloat32
    CHANNELS = 1
    FRAME_SIZE = 512  # samples delivered per callback
    RATE = 16000  # Hz

    def frames(self, n):
        """Return the number of whole frames that span ``n`` seconds."""
        return int(n * self.RATE / self.FRAME_SIZE)

    def __init__(self):
        """Create the PyAudio instance and allocate the rolling buffers."""
        self.speak = Queue()
        self.pa = pyaudio.PyAudio()
        self.last_samples = None
        atexit.register(self.pa.terminate)
        # Frequency value for each FFT result index; not used in the end.
        # self.freq = np.fft.fftfreq(self.FRAME_SIZE, d=self.RATE**-1)
        # Slice of the (shifted) FFT output that is monitored.
        self.begin = self.FRAME_SIZE * 3 // 8
        self.end = self.FRAME_SIZE // 2
        # Spectra of the last second of frames, newest in row 0.
        self.fque = np.zeros(
            (self.frames(1.0), self.end - self.begin), np.float32)
        # Raw samples of the last five seconds of frames, newest in the
        # last row.
        self.buff = np.zeros(
            (self.frames(5.0), self.FRAME_SIZE), np.float32)
        # Defaults so the callback never sees a missing attribute; start()
        # sets the same values again before opening the stream.
        self.debug = False
        self.status = 0
        self.count = 0
        self.recording = pyaudio.paContinue
        self.stream = None

    def fft(self, samples):
        """Return the Hanning-windowed, shifted spectrum as 20*log10(|X|)."""
        win = np.hanning(len(samples))
        res = np.fft.fftshift(np.fft.fft(win * samples))
        return 20 * np.log10(np.abs(res))

    def callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: buffer one frame and advance detection.

        Args:
            in_data: raw float32 sample bytes for one frame.
            frame_count: unused.
            time_info: unused.
            status: unused (PyAudio status flags; not ``self.status``).

        Returns:
            ``(in_data, self.recording)``; ``self.recording`` is the PyAudio
            continue/abort flag.
        """
        try:
            data = np.frombuffer(in_data, np.float32)
            # Store the frame in row 0, then rotate so it becomes the last
            # (newest) row.
            self.buff[0] = data
            self.buff = np.roll(self.buff, -1, axis=0)
            if self.status == 0:
                # Once extraction has started, updating the ambient history
                # is paused: only slot 0 keeps being overwritten below.
                self.fque = np.roll(self.fque, 1, axis=0)
            self.fque[0] = self.fft(data)[self.begin:self.end]
            # This is the average of the ambient sound component.
            average = np.average(self.fque, axis=0)
            values = self.fque[0] - average  # subtract it from the FFT result
            volume = np.average(values)
            if self.status:
                self.count += 1
            else:
                # Was ``self.count == 0`` (a comparison with no effect), so
                # the frame count kept growing across utterances.
                self.count = 0
            self._advance_state(volume)
            if self.debug:
                # One digit (0-9) per monitored bin, followed by the state.
                pr = [min(9, max(0, int(v / 10))) for v in values]
                print('%s %s' % (''.join(str(i) for i in pr), self.status))
        except KeyboardInterrupt:
            self.recording = pyaudio.paAbort
        # Always hand PyAudio a (data, flag) tuple, including after an
        # interrupt.
        return (in_data, self.recording)

    def _advance_state(self, volume):
        """Advance the detection state machine by one frame.

        ``self.status`` counts up to 5 while ``volume`` stays above 5, holds
        at 5 until ``volume`` drops below 5, then counts up towards 15 on
        quiet frames (and back down on loud ones).  Reaching 15 ends the
        utterance: the state resets and the buffered frames are queued.
        """
        if self.status < 5:
            if volume > 5:
                self.status += 1
            else:
                self.status = 0
        elif self.status == 5:
            if volume < 5:
                self.status += 1
        elif self.status < 15:
            if volume < 5:
                self.status += 1
            else:
                self.status -= 1
        else:
            self.status = 0
            # Copy: a plain slice is a view of self.buff, whose row 0 is
            # overwritten by the next callback.
            self.speak.put(self.buff[-self.count - 2:].copy())

    def start(self, debug=False):
        """Reset the detection state and open/start the input stream.

        Args:
            debug: when true, the callback prints a per-frame level line.
        """
        self.debug = debug
        self.status = 0
        self.count = 0
        self.recording = pyaudio.paContinue
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   output=False,
                                   frames_per_buffer=self.FRAME_SIZE,
                                   stream_callback=self.callback)
        self.stream.start_stream()

    def stop(self):
        """Ask the callback to abort, wait for it, then stop and close."""
        self.recording = pyaudio.paAbort
        while self.stream.is_active():
            time.sleep(0.5)
        # Was start_stream(): the stream must be stopped, not restarted,
        # before it is closed.
        self.stream.stop_stream()
        self.stream.close()
# Endpoint that recognize() posts the FLAC audio to; the query string
# requests Japanese (lang=ja-JP).
RECOGNIZE_URL = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&lang=ja-JP"
# RECOGNIZE_URL += "&maxresult=10" # Enable this to get the top 10 candidates back.
# Name of the encoder executable that main() runs on each recorded WAV file.
FLAC_TOOL = 'flac'
def recognize(fpath):
    """Post a FLAC file to ``RECOGNIZE_URL`` and return the decoded reply.

    Args:
        fpath: path of the FLAC file to upload.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        IOError: if ``fpath`` cannot be read.
        urllib2.URLError: if the HTTP request fails.
        ValueError: if the response body is not valid JSON.
    """
    # Local import so this block does not depend on a new file-level import.
    from contextlib import closing

    # Close the input file deterministically instead of leaking the handle.
    with open(fpath, "rb") as fp:
        flac = fp.read()
    header = {'Content-Type': 'audio/x-flac; rate=16000'}
    req = urllib2.Request(RECOGNIZE_URL, flac, header)
    # urllib2 responses are not context managers on Python 2; closing()
    # guarantees the connection is released even if parsing fails.
    with closing(urllib2.urlopen(req)) as data:
        return json.loads(data.read())
def main(spe):
    """Recognize queued utterances forever and print the hypotheses.

    Each array taken from ``spe.speak`` is written to a temporary 16-bit
    mono 16 kHz WAV file, encoded with ``FLAC_TOOL``, passed to
    ``recognize()``, and every returned hypothesis is printed as
    ``<confidence> <utterance>``.

    Args:
        spe: object whose ``speak`` attribute is a queue of float sample
            arrays (a started ``Spectrum``).

    Raises:
        SystemExit: with code 0 when interrupted by Ctrl-C.
    """
    while 1:
        try:
            buff = spe.speak.get(timeout=3)
            with tempfile.NamedTemporaryFile(suffix='.wav') as fp:
                # The encoder writes its output next to the input, with the
                # extension swapped.
                output = os.path.splitext(fp.name)[0] + '.flac'
                try:
                    f = wave.open(fp, 'w')
                    try:
                        f.setnchannels(1)
                        f.setsampwidth(2)
                        f.setframerate(16000)
                        # Clip before converting: a full-scale sample
                        # (1.0 * 32768) would otherwise wrap to -32768.
                        pcm = np.clip(buff * 32768, -32768, 32767)
                        f.writeframes(pcm.astype(np.int16).tostring())
                    finally:
                        f.close()
                    check_output([FLAC_TOOL, '-sf', fp.name])
                    res = recognize(output)
                finally:
                    # The temporary WAV is removed by the context manager;
                    # the encoder's output has to be removed explicitly.
                    if os.path.exists(output):
                        os.remove(output)
            for i in res.get('hypotheses', []):
                print('%s %s' % (i['confidence'], i['utterance']))
        except KeyboardInterrupt:
            raise SystemExit(0)
        except Empty:
            # Nothing was spoken within the timeout; poll again.
            pass
        except Exception:
            # Report the failure and back off, but let SystemExit and other
            # non-Exception signals propagate (a bare except swallowed them).
            traceback.print_exc()
            time.sleep(5)
if __name__ == '__main__':
    # Script entry point: start listening (debug output off), run the
    # recognition loop, and always shut the audio stream down on exit.
    spectrum = Spectrum()
    spectrum.start(debug=False)
    try:
        main(spectrum)
    finally:
        spectrum.stop()
@nobonobo
Copy link
Author

nobonobo commented Apr 1, 2013

認識サンプルー:

0.627582 もしもーし
0.7055486 こんばんは
0.6401324 えっ
0.27831447 えっ ライト
0.39178658 豆丁
0.56470203 京都
0.27980527 京都
0.38748074 まめちゃん
0.56085163 あめちゃん 京都
0.118296765 京都 今日
0.35291997 京都 香
0.23023404 特許 mame
0.62664324 東京都
0.4853431 東京都 こうちゃん
0.41771716 東京都 紅茶
0.49495977 うー
0.31954005 登る 上る 登る
0.69323325 面白いね
0.68706167 面白いね
0.30382764 ニコでた
0.14218235 にこ
0.71369535 マメルリハ
0.7092317 オカメインコ
0.71676314 セキセイインコ
0.7160014 精飲子 ジュウシマツ
0.34830737 嶋津 うずら
0.71328956 ちよこれいと
0.52314687 レイクトローラ 甲府
0.29295838 株 東
0.27502802 デカクチバシ
0.095073506 うー
0.41421902 yeah
0.6142067 やほう
0.41653907 canon
0.5651796 波浪
0.59454215 波浪注意報
0.5844786 波浪注意報
0.5838898 今日は桜が満開です
0.6356832 もしもーし
0.54706 マミンカですか
0.50342584 東京バナナ

@nobonobo
Copy link
Author

nobonobo commented Apr 1, 2013

雑音が切り出しの端っこに混入した時のブツッっていうのを緩和させていないので「えっ」「京都」とかが結構入る。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment