Last active
October 13, 2021 14:53
-
-
Save xiong-jie-y/c88e9f3e9516f0ecac7fb93393d3dc17 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""This script is to try yamnet on the microphone. | |
Usage: | |
pip install tensorflow | |
pip install tensorflow_hub | |
pip install PyAudio | |
pip install librosa | |
python yamnet.py | |
""" | |
import tensorflow as tf | |
import tensorflow_hub as hub | |
import numpy as np | |
import csv | |
import librosa | |
import pyaudio | |
from collections import deque | |
# Helper that loads the class display names matching YAMNet's score vector.
def class_names_from_csv(class_map_csv_text):
    """Read the class map CSV and collect its display names.

    Args:
        class_map_csv_text: Location of the class map CSV. Despite the
            parameter name, the value is handed to ``tf.io.gfile.GFile``
            to be opened, i.e. it is used as a path rather than CSV text.

    Returns:
        A list with the ``display_name`` column of every row, in file
        order (intended to line up with the model's score vector).
    """
    with tf.io.gfile.GFile(class_map_csv_text) as class_map_file:
        return [
            record['display_name']
            for record in csv.DictReader(class_map_file)
        ]
class HumanVoiceDetector:
    """Classify live microphone audio with the YAMNet model from TF Hub."""

    # The microphone is opened as 16 kHz mono, 16-bit samples.
    SAMPLE_RATE = 16000
    # Seconds of audio read from the microphone per loop iteration.
    FRAME_SECONDS = 0.1
    # Number of most recent frames kept in the circular buffer (0.9 s total).
    WINDOW_FRAMES = 9

    def __init__(self):
        """Load the YAMNet model from TensorFlow Hub."""
        self.model = hub.load('https://tfhub.dev/google/yamnet/1')

    def wait_for_human_voice(self):
        """Continuously print the top-scoring class for the latest audio.

        Reads 0.1 s chunks from the default input device, keeps the most
        recent ``WINDOW_FRAMES`` chunks in a circular buffer, runs the model
        on the concatenated window after every chunk and prints the name of
        the best-scoring class.

        NOTE: despite its name, this method never returns on its own; it
        loops until an exception (e.g. ``KeyboardInterrupt``) is raised.
        The audio stream and the PyAudio instance are released on the way
        out.
        """
        class_map_path = self.model.class_map_path().numpy()
        class_names = class_names_from_csv(class_map_path)

        # One chunk is 0.1 s of audio; chunks are accumulated into the
        # circular buffer below.
        frame_len = int(self.SAMPLE_RATE * self.FRAME_SECONDS)
        # A bounded deque drops the oldest chunk automatically once full.
        buffers = deque(maxlen=self.WINDOW_FRAMES)

        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=self.SAMPLE_RATE,
                            input=True,
                            frames_per_buffer=frame_len)
            try:
                while True:
                    # Inference may be slower than real time, so tolerate
                    # input overflows instead of raising.
                    data = stream.read(frame_len, exception_on_overflow=False)
                    # `dtype` is the *output* type of buf_to_float; the raw
                    # 16-bit samples are scaled to float32 in [-1, 1).
                    frame_data = librosa.util.buf_to_float(
                        data, n_bytes=2, dtype=np.float32)
                    buffers.append(frame_data)

                    scores, _, _ = self.model(np.concatenate(buffers))
                    # Average over the model's output frames so the result
                    # stays meaningful if the window ever spans more than
                    # one frame (identical to scores[0] for a single frame).
                    mean_scores = np.asarray(scores).mean(axis=0)
                    class_name = class_names[int(np.argmax(mean_scores))]
                    print(class_name)
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()
if __name__ == "__main__":
    # Load YAMNet, then keep printing microphone predictions.
    HumanVoiceDetector().wait_for_human_voice()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment