# xiong-jie-y/yamnet.py

## yamnet.py
"""Try YAMNet audio-event classification on live microphone input.

Usage:

pip install tensorflow
pip install tensorflow_hub
pip install PyAudio
pip install librosa

python yamnet.py
"""

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import librosa
import pyaudio

from collections import deque

# Load the class names that correspond to the entries of the model's score vector.
def class_names_from_csv(class_map_csv_text):
    """Return the class names listed in the class-map CSV.

    Args:
        class_map_csv_text: Location of the class-map CSV. Despite the name,
            the value is handed to ``tf.io.gfile.GFile`` and opened as a file
            rather than parsed as CSV text.

    Returns:
        A list with the ``display_name`` field of every CSV row, in file
        order, so that index ``i`` names entry ``i`` of the score vector.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as class_map_file:
        return [record['display_name'] for record in csv.DictReader(class_map_file)]

class HumanVoiceDetector:
    """Stream microphone audio through YAMNet and print the top class name.

    The YAMNet model is loaded from TensorFlow Hub once, at construction.
    """

    # Capture format: mono audio at 16 kHz.
    SAMPLE_RATE = 16000
    # Length of one captured chunk, in seconds.
    CHUNK_SECONDS = 0.1
    # Number of most recent chunks concatenated for each model call.
    WINDOW_CHUNKS = 9

    def __init__(self):
        self.model = hub.load('https://tfhub.dev/google/yamnet/1')

    def wait_for_human_voice(self):
        """Classify microphone audio in a loop, printing the top class name.

        Audio is read in 0.1 s chunks and appended to a sliding window. Once
        the window holds more than ``WINDOW_CHUNKS`` chunks the oldest one is
        dropped, the remaining chunks are concatenated and passed to the
        model, and the class with the highest score in the first score row is
        printed.

        The loop has no exit condition, so this method only ends when an
        exception (for example ``KeyboardInterrupt``) propagates out of it.
        The audio stream and the PyAudio instance are released before the
        exception leaves this method.
        """
        class_map_path = self.model.class_map_path().numpy()
        class_names = class_names_from_csv(class_map_path)

        # Samples per captured chunk (0.1 s at 16 kHz).
        frame_len = int(self.SAMPLE_RATE * self.CHUNK_SECONDS)

        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=self.SAMPLE_RATE,
                            input=True,
                            frames_per_buffer=frame_len)
            try:
                # Chunks are accumulated here and used as a circular buffer.
                buffers = deque()

                while True:
                    data = stream.read(frame_len, exception_on_overflow=False)
                    # Convert the 16-bit PCM bytes to float32 samples scaled by
                    # 1/32768; `dtype` is the *output* type of buf_to_float.
                    frame_data = librosa.util.buf_to_float(
                        data, n_bytes=2, dtype=np.float32)

                    buffers.append(frame_data)
                    if len(buffers) > self.WINDOW_CHUNKS:
                        buffers.popleft()

                        # Only the scores are needed; the other two model
                        # outputs are discarded.
                        scores, _, _ = self.model(np.concatenate(buffers))

                        # NOTE(review): only the first score row is inspected;
                        # confirm that is intended if the window is enlarged.
                        class_name = class_names[np.argmax(scores[0])]

                        print(class_name)
            finally:
                # Runs on any exit from the loop, so the device is released.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()

if __name__ == "__main__":
    # Script entry point: load the model, then classify microphone audio
    # until the process is interrupted.
    voice_detector = HumanVoiceDetector()
    voice_detector.wait_for_human_voice()