@thomwolf
Last active September 10, 2024 19:39
speech to text to speech
""" To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
git clone https://github.com/myshell-ai/OpenVoice
cd OpenVoice
git clone https://huggingface.co/myshell-ai/OpenVoice
cp -r OpenVoice/* .
pip install openai-whisper pynput pyaudio
"""
from openai import OpenAI
import time
import pyaudio
import numpy as np
import torch
import os
import re
import se_extractor
import whisper
from pynput import keyboard
from api import BaseSpeakerTTS, ToneColorConverter
from utils import split_sentences_latin
SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
SPEAKER_WAV = None
llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)
sampling_rate = tts_model.hps.data.sampling_rate
mark = tts_model.language_marks.get("english", None)
asr_model = whisper.load_model("base.en")
def play_audio(text):
    # Synthesize the text sentence by sentence and stream it to the default output device.
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True)
    texts = split_sentences_latin(text)
    for t in texts:
        audio_list = []
        t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)  # split camelCase words so the TTS reads them naturally
        t = f'[{mark}]{t}[{mark}]'
        stn_tst = tts_model.get_text(t, tts_model.hps, False)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(tts_model.device)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
            sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
            audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
        if target_se is not None:
            # Optionally re-colour the base voice with the target speaker embedding.
            audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
        audio_list.append(audio)
        data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
        stream.write(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
def record_and_transcribe_audio():
    # Push-to-talk: hold Shift to record from the microphone, release to stop and transcribe.
    recording = False
    def on_press(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = True
    def on_release(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = False
            return False  # stop the listener once Shift is released
    listener = keyboard.Listener(
        on_press=on_press,
        on_release=on_release)
    listener.start()
    print('Press shift to record...')
    while not recording:
        time.sleep(0.1)
    print('Start recording...')
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    frames = []
    while recording:
        data = stream.read(1024, exception_on_overflow=False)
        frames.append(np.frombuffer(data, dtype=np.int16))
    print('Finished recording')
    # Convert int16 samples to normalized float32 for Whisper (works across NumPy versions).
    data = np.hstack(frames).astype(np.float32) / 32768.0
    result = asr_model.transcribe(data)['text']
    stream.stop_stream()
    stream.close()
    p.terminate()
    return result
def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
    while True:
        user_input = record_and_transcribe_audio()
        conversation_history.append({'role': 'user', 'content': user_input})
        response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
        chatbot_response = response.choices[0].message.content
        conversation_history.append({'role': 'assistant', 'content': chatbot_response})
        print(conversation_history)
        play_audio(chatbot_response)
        # Keep only the most recent turns so the prompt stays short.
        if len(conversation_history) > 20:
            conversation_history = conversation_history[-20:]

conversation()
iddar commented Feb 16, 2024

Has anyone tried to run this in Ubuntu (WSL) and managed to get the keyboard input working?

Try changing the logic to use some other kind of human input, like a newline on stdin; see the sketch below.
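A minimal sketch of that idea, assuming the same pyaudio/numpy/Whisper setup as the gist: a hypothetical record_and_transcribe_audio_stdin() that replaces the pynput Shift listener with two Enter presses on stdin (the function name is illustrative, and asr_model is passed in rather than read from the module globals).

import threading

import numpy as np
import pyaudio


def record_and_transcribe_audio_stdin(asr_model):
    """Record between two Enter presses instead of holding Shift (WSL-friendly)."""
    input('Press Enter to start recording...')
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    frames_per_buffer=1024, input=True)
    frames = []
    stop_event = threading.Event()

    def capture():
        # Pull audio chunks until the main thread signals stop.
        while not stop_event.is_set():
            data = stream.read(1024, exception_on_overflow=False)
            frames.append(np.frombuffer(data, dtype=np.int16))

    worker = threading.Thread(target=capture)
    worker.start()
    input('Recording... press Enter to stop.')
    stop_event.set()
    worker.join()
    stream.stop_stream()
    stream.close()
    p.terminate()
    # Normalize int16 samples to float32 in [-1, 1] for Whisper.
    audio = np.hstack(frames).astype(np.float32) / 32768.0
    return asr_model.transcribe(audio)['text']

Swapping this in for record_and_transcribe_audio() inside conversation() should behave the same way on WSL, where pynput often cannot see the host keyboard.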

NVBCWT commented Feb 28, 2024

For Mac users (M1/M2 Pro), the steps below are pulled together in the sketch after this list:

  1. Before installing pyaudio, make sure you run brew install portaudio.
  2. Change the np.hstack call to np.hstack(frames).astype(np.float32) - thanks to @ajram23 above.
  3. Make sure you have git-lfs installed, otherwise the checkpoints from Hugging Face won't download properly and you will end up with a pickle (pkl) error.
  4. Use requirements.txt from the OpenVoice GitHub repo to install all dependencies in one go, and then additionally pip install whisper pynput pyaudio.
  5. I used LM Studio with Mistral as the backend; make sure to start its local inference server.
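Putting those steps together with the gist's docstring, a rough end-to-end setup on an Apple Silicon Mac might look like this (a sketch in the same shell-command style as the docstring, not verified commands from this thread):

brew install portaudio                     # required before pip can build pyaudio
brew install git-lfs && git lfs install    # so the Hugging Face checkpoint files download fully
git clone https://github.com/myshell-ai/OpenVoice
cd OpenVoice
git clone https://huggingface.co/myshell-ai/OpenVoice
cp -r OpenVoice/* .
pip install -r requirements.txt            # OpenVoice's own dependencies
pip install openai-whisper pynput pyaudio  # openai-whisper is the PyPI name for OpenAI's Whisper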

haseeb-heaven commented

That's a lot of requirements; it seems like a complex project.
But it is working on my MacBook M1 now, after quite a few complications.

NVBCWT commented Feb 28, 2024

Yes, it works on my system too. I would make some changes to it, probably integrate SpeechBrain.
