@iddar
Created February 16, 2024 04:08
Speech-to-text-to-speech (Spanish version)
""" Para usar: instale LLM studio (o Ollama), clone OpenVoice, ejecute este script en el directorio OpenVoice
git clone https://github.com/myshell-ai/OpenVoice
cd OpenVoice
git clone https://huggingface.co/myshell-ai/OpenVoice
cp -r OpenVoice/* .
pip install whisper pynput pyaudio
este proyecto se base en el script compartido por thomwolf en https://gist.github.com/thomwolf/e9c3f978d0f82600a7c24cb0bf80d606
"""
from openai import OpenAI
import time
import pyaudio
import numpy as np
import torch
import os
import re
import se_extractor
import whisper
from pynput import keyboard
from api import BaseSpeakerTTS, ToneColorConverter
from utils import split_sentences_latin
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
import IPython.display as ipd
from playsound import playsound
from scipy.io.wavfile import write as write_wav
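# Load the Spanish TTS model from the fairseq hub: a transformer trained on CSS10 Spanish
# with a HiFi-GAN vocoder; fp16 is disabled because inference runs on CPU.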
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/tts_transformer-es-css10",
    arg_overrides={"vocoder": "hifigan", "fp16": False}
)
model = models[0]
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator([model], cfg)
# SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
SYSTEM_MESSAGE_ES = "Eres Bob un asistente de IA. MANTÉN TUS RESPUESTAS MUY CORTAS Y CONVERSACIONALES."
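# Optional reference recording for OpenVoice tone-color extraction; when set, a target
# speaker embedding is computed below with se_extractor.get_se(). Leave as None to skip.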
SPEAKER_WAV = None
llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
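# LM Studio serves an OpenAI-compatible API on port 1234 by default, and the api_key value
# is ignored. If you use Ollama instead, its OpenAI-compatible endpoint is by default
# http://localhost:11434/v1, e.g.:
#   llm_client = OpenAI(base_url="http://localhost:11434/v1", api_key="not-needed")
# and the model name passed to chat.completions.create() must match a pulled model.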
tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
device = "cpu"
tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)
sampling_rate = tts_model.hps.data.sampling_rate
mark = tts_model.language_marks.get("english", None)
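# Note: the OpenVoice base speaker, tone-color converter, and speaker embeddings loaded
# above are not used by play_audio() below, which synthesizes speech with the fairseq
# Spanish model instead.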
asr_model = whisper.load_model("small")
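# Whisper's "small" checkpoint is multilingual, so it transcribes Spanish directly; swap in
# "medium" or "large" for better accuracy at the cost of speed.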
def sound(wav, fs=8000):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=len(wav.shape), rate=fs, output=True)
    stream.write(wav.tobytes())
    stream.stop_stream()
    stream.close()
    p.terminate()
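# Note: sound() above is not called anywhere in this script; play_audio() writes the
# synthesized audio to a WAV file and plays it with playsound instead.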
def play_audio(text):
    sample = TTSHubInterface.get_model_input(task, text)
    wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
    wav_cpu = wav.to('cpu')
    write_wav("audio1.wav", rate, wav_cpu.numpy())
    playsound("audio1.wav")
def record_and_transcribe_audio():
    recording = False

    def on_press(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = True

    def on_release(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = False
            return False

    listener = keyboard.Listener(
        on_press=on_press,
        on_release=on_release)
    listener.start()

    print('Press shift to record...')
    while not recording:
        time.sleep(0.1)

    print('Start recording...')
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    frames = []
    while recording:
        data = stream.read(1024, exception_on_overflow=False)
        frames.append(np.frombuffer(data, dtype=np.int16))
    print('Finished recording')

    concatenated_frames = np.hstack(frames)
    data = concatenated_frames.astype(np.float32) / 32768.0
    result = asr_model.transcribe(data, fp16=False)['text']

    stream.stop_stream()
    stream.close()
    p.terminate()
    return result
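# Whisper expects 16 kHz mono float32 audio, which is why the input stream above is opened
# at rate=16000 and the int16 frames are scaled to [-1, 1] before transcription.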
def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE_ES}]
    while True:
        user_input = record_and_transcribe_audio()
        conversation_history.append({'role': 'user', 'content': user_input})
        response = llm_client.chat.completions.create(model="mistral", messages=conversation_history)
        chatbot_response = response.choices[0].message.content
        conversation_history.append({'role': 'assistant', 'content': chatbot_response})
        print(conversation_history)
        play_audio(chatbot_response)
        if len(conversation_history) > 20:
            conversation_history = conversation_history[-20:]
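# Usage: run this file from the OpenVoice directory, hold Shift to speak, and release to
# send the utterance; the assistant's reply is spoken back through the fairseq TTS model.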
conversation()