# Nekodigi/playback_tts.py

## playback_tts.py
import threading
import time

import numpy as np
import sounddevice as sd
import speech_recognition as sr

# List every audio device up front so the right index can be chosen for the stream.
print(sd.query_devices())

# --- Stream format ---
sample_rate = 44100  # samples per second (CD quality)
channels = 1  # mono

# --- Voice detection tuning ---
loudness_threshold = 0.01  # mean absolute amplitude above which a block counts as loud
patience = 0.1  # seconds of quiet tolerated before a recording is treated as finished
amplification_factor = 20  # gain multiplied into every captured block

# --- Mutable state shared with the audio callback ---
last = 0.0  # time.time() of the most recent loud block
voice_buffer = []  # amplified blocks collected for the current recording

# Recognizer instance used for all transcription requests.
recognizer = sr.Recognizer()


# Function to check if the audio is loud
def is_loud(data, threshold):
    """Report whether the mean absolute amplitude of ``data`` exceeds ``threshold``.

    Args:
        data: Array of audio samples.
        threshold: Level the mean absolute amplitude must be strictly above.

    Returns:
        A truthy value when the block is louder than ``threshold``.
    """
    mean_level = np.mean(np.abs(data))
    return mean_level > threshold


# Function to convert audio data to text using SpeechRecognition
def speech_to_text(data, sample_rate):
    """Transcribe one block of audio with Google's speech recognition service.

    Args:
        data: NumPy array of audio samples. Floating-point samples are taken
            to be in the nominal [-1.0, 1.0] range and are converted to
            16-bit PCM; any other dtype is expected to be 16-bit PCM already.
        sample_rate: Sampling rate of ``data`` in Hz.

    Returns:
        The recognized text, or ``None`` when the speech is unintelligible.

    Raises:
        sr.RequestError: If the recognition request itself fails; it is not
            caught here.
    """
    samples = np.asarray(data)
    if np.issubdtype(samples.dtype, np.floating):
        # AudioData below is told each sample is 2 bytes wide, so float
        # samples must be rescaled to the int16 range. Clip first: values
        # outside [-1, 1] (e.g. after amplification) would wrap around.
        samples = np.clip(samples, -1.0, 1.0) * 32767
    pcm = samples.astype("<i2")

    audio_data = sr.AudioData(pcm.tobytes(), sample_rate, 2)
    try:
        # Use Google's speech recognition engine to transcribe the audio.
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # The speech was unclear.
        return None


def analyze_voice_data(buffer, sample_rate):
    """Transcribe a buffered recording as Japanese speech.

    Args:
        buffer: List of NumPy audio blocks making up one recording.
            Floating-point samples are taken to be in the nominal
            [-1.0, 1.0] range and are converted to 16-bit PCM; any other
            dtype is expected to be 16-bit PCM already.
        sample_rate: Sampling rate of the blocks in Hz.

    Returns:
        The recognized text, or ``None`` when the buffer is empty, the speech
        is unintelligible, or the recognition request fails.
    """
    if not buffer:
        return None
    print(len(buffer))

    # Join the buffered blocks into one continuous signal.
    samples = np.concatenate(buffer)
    if np.issubdtype(samples.dtype, np.floating):
        # AudioData below is told each sample is 2 bytes wide, so float
        # samples must be rescaled to the int16 range. Clip first: amplified
        # audio can exceed [-1, 1] and would otherwise wrap around.
        samples = np.clip(samples, -1.0, 1.0) * 32767
    pcm = samples.astype("<i2")

    audio_as_audio_data = sr.AudioData(pcm.tobytes(), sample_rate, 2)
    try:
        # Use Google's speech recognition engine to transcribe the voice data.
        text = recognizer.recognize_google(audio_as_audio_data, language="ja-JP")
    except sr.UnknownValueError:
        print("Couldn't recognize speech.")
        return None
    except sr.RequestError as err:
        # A failed request should not take down the caller.
        print("Speech recognition request failed:", err)
        return None
    print("Analyzed text:", text)
    return text


def amplify_audio(data, factor):
    """Scale audio samples by a constant gain.

    Args:
        data: Audio samples to scale.
        factor: Multiplier applied to every sample.

    Returns:
        The product ``data * factor``; no clipping is applied.
    """
    boosted = data * factor
    return boosted


# Define a callback function for recording and playback
def callback(indata, outdata, frames, _time, status):
    """Echo loud input to the output and collect it for transcription.

    Invoked by the duplex stream for every audio block. While the input is
    loud, or was loud no more than ``patience`` seconds ago, the amplified
    block is played back and appended to ``voice_buffer``. Once the quiet
    period exceeds ``patience``, silence is written and the collected
    recording is handed to a worker thread for transcription.

    Args:
        indata: Input samples for this block.
        outdata: Output buffer that must be filled for this block.
        frames: Number of frames in the block (unused).
        _time: Stream timing information (unused).
        status: Stream status flags; printed when any are set.
    """
    global last, voice_buffer

    if status:
        # Surface stream problems instead of silently dropping them.
        print(status)

    # Loudness is judged on the raw input, before the gain is applied.
    loud = is_loud(indata, loudness_threshold)
    amplified = amplify_audio(indata, amplification_factor)

    now = time.time()
    if loud:
        last = now

    if loud or now - last <= patience:
        # Still inside an utterance: play it back and keep recording.
        outdata[:] = amplified
        voice_buffer.append(amplified.copy())
        return

    # Quiet for longer than `patience`: output silence and flush the recording.
    outdata.fill(0)
    if voice_buffer:
        finished, voice_buffer = voice_buffer, []
        # Transcription is a blocking network request, so it must not run in
        # the audio callback; do it on a background thread instead.
        threading.Thread(
            target=_transcribe_recording, args=(finished,), daemon=True
        ).start()


def _transcribe_recording(blocks):
    """Transcribe a finished recording and print any recognized text."""
    text = analyze_voice_data(blocks, sample_rate)
    if text:
        print(text)


# Create a duplex stream to record and play back in real-time
with sd.Stream(
    device=(0, None),  # (input, output): input device 0, default output device
    samplerate=sample_rate,
    channels=channels,
    dtype="float32",
    callback=callback,
):
    # Continue recording and playing back endlessly
    print("Recording and playing back in real-time...")
    while True:
        # All the work happens in the stream callback; the main thread only
        # has to keep the stream open. Sleep rather than spin so it neither
        # burns a CPU core nor competes with the callback for the interpreter.
        time.sleep(0.1)
	import time

	import numpy as np
	import sounddevice as sd
	import speech_recognition as sr

	# List every audio device up front so the right index can be chosen for the stream.
	print(sd.query_devices())

	# --- Stream format ---
	sample_rate = 44100  # samples per second (CD quality)
	channels = 1  # mono

	# --- Voice detection tuning ---
	loudness_threshold = 0.01  # mean absolute amplitude above which a block counts as loud
	patience = 0.1  # seconds of quiet tolerated before a recording is treated as finished
	amplification_factor = 20  # gain multiplied into every captured block

	# --- Mutable state shared with the audio callback ---
	last = 0.0  # time.time() of the most recent loud block
	voice_buffer = []  # amplified blocks collected for the current recording

	# Recognizer instance used for all transcription requests.
	recognizer = sr.Recognizer()


	# Function to check if the audio is loud
	def is_loud(data, threshold):
	    """Report whether the mean absolute amplitude of ``data`` exceeds ``threshold``.

	    Args:
	        data: Array of audio samples.
	        threshold: Level the mean absolute amplitude must be strictly above.

	    Returns:
	        A truthy value when the block is louder than ``threshold``.
	    """
	    return np.abs(data).mean() > threshold


	# Function to convert audio data to text using SpeechRecognition
	def speech_to_text(data, sample_rate):
	    """Transcribe one block of audio with Google's speech recognition service.

	    Args:
	        data: NumPy array of audio samples. Floating-point samples are taken
	            to be in the nominal [-1.0, 1.0] range and are converted to
	            16-bit PCM; any other dtype is expected to be 16-bit PCM already.
	        sample_rate: Sampling rate of ``data`` in Hz.

	    Returns:
	        The recognized text, or ``None`` when the speech is unintelligible.

	    Raises:
	        sr.RequestError: If the recognition request itself fails; it is not
	            caught here.
	    """
	    samples = np.asarray(data)
	    if np.issubdtype(samples.dtype, np.floating):
	        # AudioData below is told each sample is 2 bytes wide, so float
	        # samples must be rescaled to the int16 range. Clip first: values
	        # outside [-1, 1] (e.g. after amplification) would wrap around.
	        samples = np.clip(samples, -1.0, 1.0) * 32767
	    pcm = samples.astype("<i2")

	    audio_data = sr.AudioData(pcm.tobytes(), sample_rate, 2)
	    try:
	        # Use Google's speech recognition engine to transcribe the audio.
	        return recognizer.recognize_google(audio_data)
	    except sr.UnknownValueError:
	        # The speech was unclear.
	        return None


	def analyze_voice_data(buffer, sample_rate):
	    """Transcribe a buffered recording as Japanese speech.

	    Args:
	        buffer: List of NumPy audio blocks making up one recording.
	            Floating-point samples are taken to be in the nominal
	            [-1.0, 1.0] range and are converted to 16-bit PCM; any other
	            dtype is expected to be 16-bit PCM already.
	        sample_rate: Sampling rate of the blocks in Hz.

	    Returns:
	        The recognized text, or ``None`` when the buffer is empty, the speech
	        is unintelligible, or the recognition request fails.
	    """
	    if not buffer:
	        return None
	    print(len(buffer))

	    # Join the buffered blocks into one continuous signal.
	    samples = np.concatenate(buffer)
	    if np.issubdtype(samples.dtype, np.floating):
	        # AudioData below is told each sample is 2 bytes wide, so float
	        # samples must be rescaled to the int16 range. Clip first: amplified
	        # audio can exceed [-1, 1] and would otherwise wrap around.
	        samples = np.clip(samples, -1.0, 1.0) * 32767
	    pcm = samples.astype("<i2")

	    audio_as_audio_data = sr.AudioData(pcm.tobytes(), sample_rate, 2)
	    try:
	        # Use Google's speech recognition engine to transcribe the voice data.
	        text = recognizer.recognize_google(audio_as_audio_data, language="ja-JP")
	    except sr.UnknownValueError:
	        print("Couldn't recognize speech.")
	        return None
	    except sr.RequestError as err:
	        # A failed request should not take down the caller.
	        print("Speech recognition request failed:", err)
	        return None
	    print("Analyzed text:", text)
	    return text


	def amplify_audio(data, factor):
	    """Scale audio samples by a constant gain.

	    Args:
	        data: Audio samples to scale.
	        factor: Multiplier applied to every sample.

	    Returns:
	        The product ``data * factor``; no clipping is applied.
	    """
	    return data * factor


	# Define a callback function for recording and playback
	def callback(indata, outdata, frames, _time, status):
	    """Echo loud input to the output and collect it for transcription.

	    Invoked by the duplex stream for every audio block. While the input is
	    loud, or was loud no more than ``patience`` seconds ago, the amplified
	    block is played back and appended to ``voice_buffer``. Once the quiet
	    period exceeds ``patience``, silence is written and the collected
	    recording is handed to a worker thread for transcription.

	    Args:
	        indata: Input samples for this block.
	        outdata: Output buffer that must be filled for this block.
	        frames: Number of frames in the block (unused).
	        _time: Stream timing information (unused).
	        status: Stream status flags; printed when any are set.
	    """
	    global last, voice_buffer

	    if status:
	        # Surface stream problems instead of silently dropping them.
	        print(status)

	    # Loudness is judged on the raw input, before the gain is applied.
	    loud = is_loud(indata, loudness_threshold)
	    amplified = amplify_audio(indata, amplification_factor)

	    now = time.time()
	    if loud:
	        last = now

	    if loud or now - last <= patience:
	        # Still inside an utterance: play it back and keep recording.
	        outdata[:] = amplified
	        voice_buffer.append(amplified.copy())
	        return

	    # Quiet for longer than `patience`: output silence and flush the recording.
	    outdata.fill(0)
	    if voice_buffer:
	        finished, voice_buffer = voice_buffer, []
	        # Transcription is a blocking network request, so it must not run in
	        # the audio callback; do it on a background thread instead.
	        threading.Thread(
	            target=_transcribe_recording, args=(finished,), daemon=True
	        ).start()


	def _transcribe_recording(blocks):
	    """Transcribe a finished recording and print any recognized text."""
	    text = analyze_voice_data(blocks, sample_rate)
	    if text:
	        print(text)


	# Create a duplex stream to record and play back in real-time
	with sd.Stream(
	    device=(0, None),  # (input, output): input device 0, default output device
	    samplerate=sample_rate,
	    channels=channels,
	    dtype="float32",
	    callback=callback,
	):
	    # Continue recording and playing back endlessly
	    print("Recording and playing back in real-time...")
	    while True:
	        # All the work happens in the stream callback; the main thread only
	        # has to keep the stream open. Sleep rather than spin so it neither
	        # burns a CPU core nor competes with the callback for the interpreter.
	        time.sleep(0.1)