Skip to content

Instantly share code, notes, and snippets.

@r0yfire
Last active March 4, 2024 19:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save r0yfire/24a768af1d7a48c7cb70ae45fecd1cc7 to your computer and use it in GitHub Desktop.
Voice Chat with LLMs
"""
Originally posted on:
https://royfirestein.com/blog/real-time-voice-chat-with-ai
"""
import os
import wave
from pydub import AudioSegment
from groq import Groq
from whispercpp import Whisper
from elevenlabs import generate, stream
import pyaudio
# Initialize the Whisper speech-to-text client ('tiny' model: fastest, least accurate).
whisper = Whisper('tiny')

# Set the API keys. setdefault keeps a real key that is already exported in
# the environment from being clobbered by the placeholder value.
os.environ.setdefault("ELEVEN_API_KEY", "YOUR API KEY")
os.environ.setdefault("GROQ_API_KEY", "YOUR API KEY")

# Create API clients
groq_client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# System prompt that frames every chat completion request.
SYSTEM_PROMPT = "\n".join([
    "You are a friendly hotel frontdesk agent. You are here to help guests with their problems.",
    "Your responses must be very short. All of your responses must be conversational as if speaking to someone.",
    "Check-in is available after 3 PM, and check out is at 11 the next day."
])

# Directory for generated audio artifacts; created up front so later writes succeed.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
def play_speech(prompt):
    """Synthesize *prompt* with ElevenLabs text-to-speech and play it.

    Uses the multilingual v2 model with the "Rachel" voice, streaming the
    audio so playback starts before synthesis finishes.
    """
    stream(
        generate(
            text=prompt,
            model="eleven_multilingual_v2",
            voice="Rachel",
            stream=True,
        )
    )
def llm_chat(user_input, chat_history, bot_name):
    """Send the conversation to the Groq LLM and return its reply text.

    Builds the message list as system prompt + prior *chat_history* + the
    new *user_input*, prints the reply prefixed with *bot_name*, and
    returns the reply string.
    """
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    conversation.extend(chat_history)
    conversation.append({"role": "user", "content": user_input})

    # Request a completion from the Groq-hosted Mixtral model.
    completion = groq_client.chat.completions.create(
        messages=conversation,
        model="mixtral-8x7b-32768"
    )

    reply = completion.choices[0].message.content
    print(f"{bot_name}: {reply}")
    return reply
def transcribe_audio(audio_file):
    """Transcribe *audio_file* with Whisper and return the lowercase text.

    Empty/whitespace-only segments are dropped; the remaining segments are
    joined with single spaces.
    """
    segments = whisper.extract_text(whisper.transcribe(audio_file))
    cleaned = [segment.lower() for segment in segments if segment.strip()]
    return " ".join(cleaned)
def record_audio(file_path):
    """Record ~5 seconds of mono 44.1 kHz microphone audio into *file_path* as WAV.

    Ctrl-C stops recording early and whatever was captured is still written.
    Any other error during capture is reported and re-raised. The audio
    device is always released, even on failure.
    """
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 512
    RECORD_SECONDS = 5

    p = pyaudio.PyAudio()
    mic = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK
    )
    frames = []
    print("Recording...")
    try:
        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(mic.read(CHUNK))
    except KeyboardInterrupt:
        # Early stop: fall through and save what we have.
        pass
    except Exception as e:
        print(f"Error while recording: {e}")
        raise  # bare raise preserves the original traceback
    finally:
        # Always release the audio device, even when recording failed.
        # (The original leaked the stream/PyAudio instance on error.)
        mic.stop_stream()
        mic.close()
        p.terminate()
    print("Recording complete.")

    # Write the captured frames out as a standard WAV file.
    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
def converse():
    """Run the interactive voice-chat loop: record, transcribe, respond, speak.

    Loops until the user says "exit". Keeps a rolling window of the last 20
    chat messages to bound the prompt size.
    """
    audio_file = "recording.wav"
    chat_history = []
    play_speech("Hello, welcome to SkyLounge Hotel. How can I help you today?")
    while True:
        # Record and transcribe the user's speech, then delete the temp file.
        record_audio(audio_file)
        user_speech = transcribe_audio(audio_file)
        os.remove(audio_file)
        if user_speech.lower() == "exit":
            break
        print(f"You: {user_speech}")
        # BUGFIX: call llm_chat *before* appending the user message to the
        # history — llm_chat adds user_input to the request itself, so
        # appending first sent every user utterance to the model twice.
        bot_response = llm_chat(user_speech, chat_history, "Bot")
        chat_history.append({"role": "user", "content": user_speech})
        chat_history.append({"role": "assistant", "content": bot_response})
        # Speak the reply.
        play_speech(bot_response)
        # Trim old turns so the prompt stays small.
        if len(chat_history) > 20:
            chat_history = chat_history[-20:]
# Start the voice-chat loop only when executed as a script (not on import).
if __name__ == "__main__":
    converse()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment