Created
June 12, 2024 00:24
-
-
Save 0xrushi/e56085f93698c7267af9b1ba9643dc7a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requires an OpenAI API key (the OpenAI client is expected to read it from the OPENAI_API_KEY environment variable).
import time | |
import threading | |
import numpy as np | |
import sounddevice as sd | |
from queue import Queue | |
from rich.console import Console | |
from langchain.memory import ConversationBufferMemory | |
from langchain.chains import ConversationChain | |
from langchain.prompts import PromptTemplate | |
from langchain_community.llms import Ollama | |
from scipy.io.wavfile import write | |
from openai import OpenAI | |
from interpreter import interpreter | |
from langchain_community import llms | |
# Shared Rich console used for every prompt, status spinner and message this script prints.
console = Console()
def save_numpy_to_wav(numpy_array, file_path, sample_rate=16000):
    """
    Store an audio array on disk as a WAV file.

    Args:
        numpy_array (numpy.ndarray): The audio samples to write.
        file_path (str): Destination path of the WAV file.
        sample_rate (int): Sample rate in Hz recorded in the file header.

    Returns:
        None
    """
    write(filename=file_path, rate=sample_rate, data=numpy_array)
# System-style prompt sent to the interpreter. ``{relevant_commands}`` receives
# the prior transcript and ``{instruction}`` the user's latest request.
template = """
You are an AI assistant that helps users convert their broken English descriptions into a sequence of tasks.
Analyze if the statement says multiple tasks or a single. If there are multiple tasks, write a plan. **Always recap the plan between each code block** (you have extreme short-term memory loss, so you need to recap the plan between each message block to retain it).
When you execute code, it will be executed **on the user's machine**. The user has given you **full and complete permission** to execute any code necessary to complete the task. Execute the code.
You can access the internet. Run **any code** to achieve the goal, and if at first you don't succeed, try again and again.
You can install new packages.
When a user refers to a filename, they're likely referring to an existing file in the directory you're currently executing code in.
In general, try to **make plans** with as few steps as possible. As for actually executing code to carry out that plan, for *stateful* languages (like python, javascript, shell, but NOT for html which starts from 0 every time).
You are capable of **any** task.
The conversation transcript is as follows:
{relevant_commands}
And here is the user's follow-up: {instruction}
Your response:
"""
prompt = PromptTemplate(
    template=template,
    input_variables=["instruction", "relevant_commands"],
)
class stt:
    """Speech-to-text helper that sends audio to OpenAI's ``whisper-1`` model."""

    @staticmethod
    def transcribe(audio_array, **kwargs):
        """
        Transcribe an audio array by uploading it as a WAV file to OpenAI.

        Declared as a static method because it takes no ``self``; it can be
        called as ``stt.transcribe(audio)`` or on an instance.

        Args:
            audio_array (numpy.ndarray): Audio samples; written to disk at the
                default sample rate of ``save_numpy_to_wav``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            str: The transcribed text.
        """
        import os
        import tempfile

        # Write the upload into a private temporary directory rather than the
        # current working directory, so a user's own ``output.wav`` is never
        # overwritten and no recording is left behind. The ``.wav`` file name
        # is kept for the upload.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = os.path.join(tmp_dir, "output.wav")
            save_numpy_to_wav(audio_array, wav_path)
            client = OpenAI()
            with open(wav_path, "rb") as audio_file:
                transcription = client.audio.transcriptions.create(
                    model="whisper-1", file=audio_file
                )
        print(transcription.text)
        return transcription.text
def record_audio(stop_event, data_queue):
    """
    Stream microphone audio into a queue until told to stop.

    Opens a 16 kHz, mono, 16-bit raw input stream and keeps it open while
    polling ``stop_event`` every 0.1 seconds.

    Args:
        stop_event (threading.Event): Set this event to end the recording.
        data_queue (queue.Queue): Receives each captured buffer as ``bytes``.

    Returns:
        None
    """

    def _enqueue_chunk(indata, frames, time_info, status):
        # Stream callback: report any status flags, then queue a copy of the
        # raw buffer.
        if status:
            console.print(status)
        data_queue.put(bytes(indata))

    stream = sd.RawInputStream(
        samplerate=16000, dtype="int16", channels=1, callback=_enqueue_chunk
    )
    with stream:
        while True:
            if stop_event.is_set():
                break
            time.sleep(0.1)
def transcribe(audio_np: np.ndarray) -> str:
    """
    Convert recorded audio into text by delegating to ``stt.transcribe``.

    Args:
        audio_np (numpy.ndarray): The audio data to be transcribed.

    Returns:
        str: The transcribed text.
    """
    return stt.transcribe(audio_np)
def get_llm_response(text: str) -> str:
    """
    Send the user's instruction to Open Interpreter and return its first reply.

    The instruction is inserted into the module-level ``prompt`` template with
    an empty transcript. ``interpreter.auto_run`` is enabled, so any code the
    interpreter produces is executed without asking for confirmation.

    Args:
        text (str): The user's instruction.

    Returns:
        str: The ``"content"`` of the first message returned by the chat, or
        an empty string when the chat returns no messages.
    """
    filled_prompt = prompt.format(instruction=text, relevant_commands="")
    interpreter.auto_run = True
    messages = interpreter.chat(filled_prompt, display=True, stream=False)
    # Guard against an empty result instead of failing with IndexError.
    if not messages:
        return ""
    return messages[0]["content"]
def play_audio(sample_rate, audio_array):
    """
    Play audio through sounddevice and block until playback finishes.

    Args:
        sample_rate (int): The sample rate of the audio data.
        audio_array (numpy.ndarray): The audio data to be played.

    Returns:
        None
    """
    sd.play(audio_array, samplerate=sample_rate)
    sd.wait()
if __name__ == "__main__":
    # Interactive loop: record between two Enter presses, transcribe the
    # audio, send the text to the interpreter and print its reply.
    # Imported here because only this loop needs it.
    from rich.markup import escape

    console.print("[cyan]Assistant started! Press Ctrl+C to exit.")
    try:
        while True:
            console.input(
                "Press Enter to start recording, then press Enter again to stop."
            )
            data_queue = Queue()  # type: ignore[var-annotated]
            stop_event = threading.Event()
            recording_thread = threading.Thread(
                target=record_audio,
                args=(stop_event, data_queue),
            )
            recording_thread.start()
            try:
                input()
            finally:
                # Always stop the recorder, even on Ctrl+C. Otherwise the
                # non-daemon thread keeps polling and the process never exits.
                stop_event.set()
                recording_thread.join()

            # The recorder has been joined, so the queue is no longer written to.
            audio_data = b"".join(data_queue.queue)
            # Scale 16-bit PCM samples to float32 in [-1.0, 1.0).
            audio_np = (
                np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            )
            if audio_np.size > 0:
                with console.status("Transcribing...", spinner="earth"):
                    text = transcribe(audio_np)
                # Escape free text so brackets in it are printed literally
                # rather than parsed as Rich markup tags.
                console.print(f"[yellow]You: {escape(text)}")
                with console.status("Generating response...", spinner="earth"):
                    response = get_llm_response(text)
                # TODO: synthesize speech for the response and play it with
                # play_audio once a TTS backend is available.
                print(response)
                console.print(f"[cyan]Assistant: {escape(response)}")
            else:
                console.print(
                    "[red]No audio recorded. Please ensure your microphone is working."
                )
    except KeyboardInterrupt:
        console.print("\n[red]Exiting...")
    console.print("[blue]Session ended.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment