Created
April 23, 2024 00:43
-
-
Save Nekodigi/5789392a8caf8696077589ee0894a7c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import queue
import threading
import time

import numpy as np
import sounddevice as sd
import speech_recognition as sr
print(sd.query_devices())  # Displays all audio devices

# Seconds of quiet tolerated after the last loud block before the buffered
# audio is treated as a finished utterance (compared against time.time()).
patience = 0.1
sample_rate = 44100  # CD-quality sample rate
channels = 1  # mono audio
last = 0.0  # time.time() stamp of the most recent loud block
amplification_factor = 20  # Increase the volume by this multiplier
# Define a loudness threshold to trigger playback
loudness_threshold = 0.01  # Adjust this to set sensitivity
recognizer = sr.Recognizer()
voice_buffer = []  # amplified audio blocks of the utterance in progress
# Function to check if the audio is loud
def is_loud(data, threshold):
    """Return True when the mean absolute amplitude of *data* exceeds *threshold*.

    Args:
        data: Array-like of audio samples (any shape).
        threshold: Mean absolute amplitude above which the audio counts as loud.

    Returns:
        A plain ``bool``. Empty input is never loud.
    """
    samples = np.asarray(data)
    # The mean of an empty array is NaN and emits a RuntimeWarning; short-circuit.
    if samples.size == 0:
        return False
    return bool(np.abs(samples).mean() > threshold)
# Function to convert audio data to text using SpeechRecognition
def speech_to_text(data, sample_rate):
    """Transcribe an array of audio samples with Google's speech recognizer.

    Args:
        data: NumPy array of audio samples. Floating-point samples are treated
            as normalised to [-1.0, 1.0] and converted to 16-bit PCM; other
            dtypes are cast to int16.
        sample_rate: Sampling rate of *data* in Hz.

    Returns:
        The recognised text, or ``None`` when the speech could not be
        understood or the recognition request failed.
    """
    samples = np.asarray(data)
    if np.issubdtype(samples.dtype, np.floating):
        # AudioData is created with sample_width=2 (16-bit PCM), so raw float
        # bytes would be misread as noise. Clip first so over-amplified samples
        # saturate instead of wrapping around.
        samples = np.clip(samples, -1.0, 1.0) * 32767
    pcm16 = samples.astype(np.int16)
    audio_data = sr.AudioData(pcm16.tobytes(), sample_rate, 2)
    try:
        # Use Google's speech recognition engine to transcribe audio
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # If speech is unclear, return None
        return None
    except sr.RequestError as err:
        # Network/API failure: report it instead of propagating to the caller.
        print("Speech recognition request failed:", err)
        return None
def analyze_voice_data(buffer, sample_rate):
    """Transcribe the buffered audio blocks as Japanese speech.

    Args:
        buffer: Sequence of NumPy arrays holding consecutive audio blocks.
            Floating-point samples are treated as normalised to [-1.0, 1.0].
        sample_rate: Sampling rate of the buffered audio in Hz.

    Returns:
        The recognised text, or ``None`` when the buffer is empty, the speech
        could not be understood, or the recognition request failed.
    """
    if not buffer:
        return None
    print(len(buffer))
    # Convert buffered audio to a single chunk of audio data
    samples = np.concatenate(buffer)
    if np.issubdtype(samples.dtype, np.floating):
        # AudioData is created with sample_width=2 (16-bit PCM), so raw float
        # bytes would be misread as noise. Clip first so over-amplified samples
        # saturate instead of wrapping around.
        samples = np.clip(samples, -1.0, 1.0) * 32767
    pcm16 = samples.astype(np.int16)
    # Convert to AudioData for speech recognition
    audio_as_audio_data = sr.AudioData(pcm16.tobytes(), sample_rate, 2)
    try:
        # Use Google's speech recognition engine to transcribe the voice data
        text = recognizer.recognize_google(audio_as_audio_data, language="ja-JP")
    except sr.UnknownValueError:
        print("Couldn't recognize speech.")
        return None
    except sr.RequestError as err:
        # Network/API failure: report it instead of propagating to the caller.
        print("Speech recognition request failed:", err)
        return None
    print("Analyzed text:", text)
    return text
def amplify_audio(data, factor):
    """Scale every audio sample in *data* by *factor*.

    Args:
        data: Array-like of audio samples.
        factor: Multiplier applied to each sample.

    Returns:
        A new NumPy array with the scaled samples. No clipping is applied, so
        results may exceed the input's original range.
    """
    # np.asarray keeps ndarrays as-is and stops a plain list from being
    # repeated (list * int) instead of scaled.
    return np.asarray(data) * factor
# Finished utterances handed from the audio callback to the recognition worker.
_utterance_queue = queue.Queue()


def _recognition_worker():
    """Transcribe queued utterances so the audio callback never blocks."""
    while True:
        chunks = _utterance_queue.get()
        try:
            text = analyze_voice_data(chunks, sample_rate)
        except Exception as err:  # keep the worker alive across failures
            print("Speech analysis failed:", err)
            continue
        if text:
            print(text)


threading.Thread(target=_recognition_worker, daemon=True).start()


# Define a callback function for recording and playback
def callback(indata, outdata, frames, _time, status):
    """Echo loud input to the output and queue finished utterances.

    While the input is loud, or has been quiet for no longer than
    ``patience`` seconds, the amplified block is played back and appended to
    ``voice_buffer``. Once the quiet period exceeds ``patience``, silence is
    output and the buffered audio is handed to a background thread for
    speech recognition; the network request is too slow to run inside the
    real-time audio callback.

    Args:
        indata: Input samples for this block.
        outdata: Output buffer; always filled completely.
        frames: Number of frames in this block (unused).
        _time: Stream timing information (unused).
        status: Stream status flags; printed when set.
    """
    global last, voice_buffer
    if status:
        print(status)
    # Loudness is judged on the raw input, before amplification.
    loud = is_loud(indata, loudness_threshold)
    amplified = amplify_audio(indata, amplification_factor)
    now = time.time()
    if loud:
        last = now
    if loud or now - last <= patience:
        # Speaking, or still within the grace period: echo and keep recording.
        outdata[:] = amplified
        voice_buffer.append(amplified.copy())
        return
    # Quiet for longer than `patience`: output silence and flush the utterance.
    outdata.fill(0)
    if voice_buffer:
        _utterance_queue.put(voice_buffer)
        voice_buffer = []
# Create a duplex stream to record and play back in real-time
with sd.Stream(
    device=(0, None),  # NOTE(review): presumably input device 0 and default output - confirm
    samplerate=sample_rate,
    channels=channels,
    dtype="float32",
    callback=callback,
):
    # Continue recording and playing back endlessly
    print("Recording and playing back in real-time...")
    try:
        while True:
            # Sleep instead of spinning: a bare `pass` loop burns a full CPU
            # core and competes with the audio callback for the interpreter.
            time.sleep(0.1)
    except KeyboardInterrupt:
        # Ctrl+C ends the session; leaving the `with` block closes the stream.
        pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment