@0xrushi
Created June 12, 2024 00:24
# Uses OPENAI API KEY
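# Assumed prerequisites (inferred from the imports below): the pip packages
# openai, sounddevice, scipy, numpy, rich, langchain, and open-interpreter,
# plus the OPENAI_API_KEY environment variable set before running.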
import time
import threading
import numpy as np
import sounddevice as sd
from queue import Queue
from rich.console import Console
from langchain.prompts import PromptTemplate
from scipy.io.wavfile import write
from openai import OpenAI
from interpreter import interpreter

console = Console()

def save_numpy_to_wav(numpy_array, file_path, sample_rate=16000):
    # Persist the recorded samples to a WAV file so they can be sent to the Whisper API.
    write(file_path, sample_rate, numpy_array)

template = """
You are an AI assistant that helps users convert their broken English descriptions into a sequence of tasks.
Analyze whether the statement describes multiple tasks or a single one. If there are multiple tasks, write a plan. **Always recap the plan between each code block** (you have extreme short-term memory loss, so you need to recap the plan between each message block to retain it).
When you execute code, it will be executed **on the user's machine**. The user has given you **full and complete permission** to execute any code necessary to complete the task. Execute the code.
You can access the internet. Run **any code** to achieve the goal, and if at first you don't succeed, try again and again.
You can install new packages.
When a user refers to a filename, they're likely referring to an existing file in the directory you're currently executing code in.
In general, try to **make plans** with as few steps as possible. When executing code to carry out that plan, remember that *stateful* languages (like python, javascript, and shell, but NOT html, which starts from scratch every time) keep their state between code blocks, so don't try to do everything in a single block.
You are capable of **any** task.
The conversation transcript is as follows:
{relevant_commands}
And here is the user's follow-up: {instruction}
Your response:
"""

prompt = PromptTemplate(input_variables=["instruction", "relevant_commands"], template=template)
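# {instruction} receives the transcribed speech; {relevant_commands} is passed as an
# empty string in get_llm_response below, but could carry prior conversation history.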

class stt:
    @staticmethod
    def transcribe(audio_array, **kwargs):
        # Write the recording to disk, then send it to the OpenAI Whisper API.
        save_numpy_to_wav(audio_array, "output.wav")
        client = OpenAI()
        with open("output.wav", "rb") as audio_file:
            transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
        print(transcription.text)
        return transcription.text

def record_audio(stop_event, data_queue):
    """
    Captures audio data from the user's microphone and adds it to a queue for further processing.

    Args:
        stop_event (threading.Event): An event that, when set, signals the function to stop recording.
        data_queue (queue.Queue): A queue to which the recorded audio data will be added.

    Returns:
        None
    """
    def callback(indata, frames, time, status):
        if status:
            console.print(status)
        data_queue.put(bytes(indata))

    with sd.RawInputStream(
        samplerate=16000, dtype="int16", channels=1, callback=callback
    ):
        while not stop_event.is_set():
            time.sleep(0.1)

def transcribe(audio_np: np.ndarray) -> str:
    """
    Transcribes the given audio data using the OpenAI Whisper API.

    Args:
        audio_np (numpy.ndarray): The audio data to be transcribed.

    Returns:
        str: The transcribed text.
    """
    text = stt.transcribe(audio_np)
    return text

def get_llm_response(text: str) -> str:
    """
    Generates a response by filling the prompt template and running it through open-interpreter.

    Args:
        text (str): The input text to be processed.

    Returns:
        str: The generated response.
    """
    filled_prompt = prompt.format(instruction=text, relevant_commands="")
    interpreter.auto_run = True  # allow open-interpreter to run generated code without confirmation
    chunk = interpreter.chat(filled_prompt, display=True, stream=False)
    return (chunk[0])["content"]

def play_audio(sample_rate, audio_array):
    """
    Plays the given audio data using the sounddevice library.

    Args:
        sample_rate (int): The sample rate of the audio data.
        audio_array (numpy.ndarray): The audio data to be played.

    Returns:
        None
    """
    sd.play(audio_array, sample_rate)
    sd.wait()
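# NOTE: play_audio is currently unused; the TTS synthesis and playback calls in the main loop are commented out.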

if __name__ == "__main__":
    console.print("[cyan]Assistant started! Press Ctrl+C to exit.")

    try:
        while True:
            console.input(
                "Press Enter to start recording, then press Enter again to stop."
            )

            data_queue = Queue()  # type: ignore[var-annotated]
            stop_event = threading.Event()
            recording_thread = threading.Thread(
                target=record_audio,
                args=(stop_event, data_queue),
            )
            recording_thread.start()

            input()
            stop_event.set()
            recording_thread.join()

            # Concatenate the queued int16 chunks and normalize to float32 in [-1.0, 1.0].
            audio_data = b"".join(list(data_queue.queue))
            audio_np = (
                np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            )

            if audio_np.size > 0:
                with console.status("Transcribing...", spinner="earth"):
                    text = transcribe(audio_np)
                console.print(f"[yellow]You: {text}")

                with console.status("Generating response...", spinner="earth"):
                    response = get_llm_response(text)
                    # sample_rate, audio_array = tts.long_form_synthesize(response)

                console.print(f"[cyan]Assistant: {response}")
                # play_audio(44000, audio_array)
            else:
                console.print(
                    "[red]No audio recorded. Please ensure your microphone is working."
                )
    except KeyboardInterrupt:
        console.print("\n[red]Exiting...")

    console.print("[blue]Session ended.")