Skip to content

Instantly share code, notes, and snippets.

@Nekodigi
Created April 23, 2024 00:43
Show Gist options
  • Save Nekodigi/5789392a8caf8696077589ee0894a7c4 to your computer and use it in GitHub Desktop.
Save Nekodigi/5789392a8caf8696077589ee0894a7c4 to your computer and use it in GitHub Desktop.
import threading
import time

import numpy as np
import sounddevice as sd
import speech_recognition as sr
# List every audio device PortAudio can see so the right index can be chosen.
print(sd.query_devices())

# --- Stream format -------------------------------------------------------
sample_rate = 44100  # samples per second
channels = 1  # mono

# --- Voice-activity settings ---------------------------------------------
# Mean absolute level above which an input block counts as "loud".
loudness_threshold = 0.01
# Seconds of quiet tolerated after the last loud block before the buffered
# audio is handed to speech recognition.
patience = 0.1
# Gain applied to every input block before playback and buffering.
amplification_factor = 20

# --- State shared with the audio callback --------------------------------
# time.time() stamp of the most recent loud block (0.0 = none seen yet).
last = 0.0
# Amplified blocks collected for the utterance currently in progress.
voice_buffer = []
recognizer = sr.Recognizer()
def is_loud(data, threshold):
    """Return True when the mean absolute amplitude of ``data`` exceeds ``threshold``.

    Args:
        data: Audio samples as a NumPy array or any array-like.
        threshold: Loudness cut-off, on the same scale as the samples.

    Returns:
        A plain ``bool``. Empty input is treated as silence.
    """
    samples = np.asarray(data)
    # An empty block has no level to measure; taking its mean would produce
    # NaN and a NumPy RuntimeWarning, so report it as "not loud" directly.
    if samples.size == 0:
        return False
    return bool(np.abs(samples).mean() > threshold)
def speech_to_text(data, sample_rate):
    """Transcribe a block of audio samples with Google's speech recognizer.

    Args:
        data: NumPy array of samples. Floating-point input is assumed to be
            normalized to [-1.0, 1.0] and is converted to 16-bit PCM; any
            other dtype is assumed to already be 16-bit PCM and is passed
            through unchanged.
        sample_rate: Sampling rate of ``data`` in Hz.

    Returns:
        The recognized text, or ``None`` when the speech is unintelligible.
    """
    samples = np.asarray(data)
    if np.issubdtype(samples.dtype, np.floating):
        # AudioData is told below that every sample is 2 bytes wide, so float
        # samples must be rescaled to int16 first; handing over the raw float
        # bytes would be decoded as noise. Clip so loud input cannot wrap.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    audio_data = sr.AudioData(samples.tobytes(), sample_rate, 2)
    try:
        # Use Google's speech recognition engine to transcribe the audio.
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # Speech was unclear.
        return None
def analyze_voice_data(buffer, sample_rate):
    """Transcribe buffered audio blocks as Japanese speech.

    Args:
        buffer: List of NumPy sample blocks making up one utterance.
            Floating-point blocks are assumed to be on a [-1.0, 1.0] scale and
            are converted to 16-bit PCM; other dtypes are assumed to already
            be 16-bit PCM.
        sample_rate: Sampling rate of the blocks in Hz.

    Returns:
        The recognized text, or ``None`` when the buffer is empty, the speech
        could not be recognized, or the recognition request failed.
    """
    if not buffer:
        return None
    print(len(buffer))

    # Join the buffered blocks into one contiguous run of samples.
    samples = np.concatenate(buffer)
    if np.issubdtype(samples.dtype, np.floating):
        # AudioData is told below that every sample is 2 bytes wide, so float
        # samples must be rescaled to int16 first; raw float bytes would be
        # decoded as noise. Clipping keeps amplified input from wrapping.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    audio_as_audio_data = sr.AudioData(samples.tobytes(), sample_rate, 2)

    try:
        # Use Google's speech recognition engine to transcribe the voice data.
        text = recognizer.recognize_google(audio_as_audio_data, language="ja-JP")
    except sr.UnknownValueError:
        print("Couldn't recognize speech.")
        return None
    except sr.RequestError as err:
        # Network/API failure: report it rather than letting the exception
        # propagate into whatever is driving the recognition.
        print("Speech recognition request failed:", err)
        return None
    print("Analyzed text:", text)
    return text
def amplify_audio(data, factor):
    """Scale audio samples by a constant gain.

    Args:
        data: Samples to scale; anything that supports ``*`` with ``factor``.
        factor: Gain multiplier.

    Returns:
        The product ``data * factor``. No clipping is applied.
    """
    boosted = data * factor
    return boosted
def _transcribe_segment(segment):
    """Run speech recognition on one buffered utterance and print the result."""
    text = analyze_voice_data(segment, sample_rate)
    if text:
        print(text)


def callback(indata, outdata, frames, _time, status):
    """Duplex stream callback: echo speech to the output and buffer it.

    While the input is loud, and for ``patience`` seconds after the last loud
    block, the amplified input is copied to the output and appended to
    ``voice_buffer``. Once the quiet period exceeds ``patience``, silence is
    written to the output and the buffered utterance is sent off for
    transcription.

    Args:
        indata: Input samples for this block.
        outdata: Output buffer to fill for this block.
        frames: Number of frames in the block (unused).
        _time: Stream timing information (unused).
        status: Stream status flags; printed when set.
    """
    global last, voice_buffer
    if status:
        # Surface over/underflow flags instead of dropping them silently.
        print(status)

    # Loudness is judged on the raw input, before the gain is applied.
    loud = is_loud(indata, loudness_threshold)
    boosted = amplify_audio(indata, amplification_factor)

    now = time.time()
    if loud:
        last = now

    if loud or now - last <= patience:
        # Speaking, or still inside the grace period: play back and record.
        outdata[:] = boosted
        voice_buffer.append(boosted.copy())
        return

    # Quiet for longer than `patience`: output silence and flush the buffer.
    outdata.fill(0)
    if voice_buffer:
        # Hand the finished utterance to a worker thread. Recognition is a
        # network request and must not block the audio callback, otherwise
        # the stream stalls while the request is in flight. The list is
        # swapped out first so the worker owns its segment exclusively.
        segment, voice_buffer = voice_buffer, []
        threading.Thread(
            target=_transcribe_segment, args=(segment,), daemon=True
        ).start()
# Open a duplex stream that records and plays back in real time.
# device=(0, None) selects input device index 0 and the default output
# device; check the index against the device list printed at startup.
try:
    with sd.Stream(
        device=(0, None),
        samplerate=sample_rate,
        channels=channels,
        dtype="float32",
        callback=callback,
    ):
        print("Recording and playing back in real-time...")
        while True:
            # Sleep instead of spinning: a bare `pass` loop pins a CPU core
            # and competes with the audio callback for interpreter time.
            sd.sleep(1000)
except KeyboardInterrupt:
    # Ctrl+C is the only way out of the loop; the `with` block has already
    # closed the stream by the time control reaches here.
    print("Stopped.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment