Last active
August 6, 2024 02:34
-
-
Save nheingit/c0c4781812970eb0fa21d737c0ff99d7 to your computer and use it in GitHub Desktop.
questions.py and main.py for day 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import logging | |
from telegram import Update | |
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler | |
from openai import OpenAI | |
import pandas as pd | |
import numpy as np | |
from questions import answer_question | |
# Credentials are read from the environment; a missing variable raises
# KeyError at import time.
openai = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
tg_bot_token = os.environ['TG_BOT_TOKEN']

# Resolve processed/embeddings.csv relative to this script so the bot can be
# started from any working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
csv_path = os.path.join(current_dir, "processed", "embeddings.csv")

df = pd.read_csv(csv_path, index_col=0)
# The "embeddings" column is stored as text; each cell is evaluated back into
# a Python object and then wrapped in a NumPy array.
# NOTE(review): eval() executes whatever the cell contains -- only load CSV
# files you generated yourself.
df["embeddings"] = df["embeddings"].apply(eval).apply(np.array)

# Conversation history for the /chat command, seeded with the system prompt.
# It is a single module-level list, so every chat appends to the same history.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that answers questions.",
    },
]

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
async def chat(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle /chat: forward the message to the chat model and send its reply.

    The incoming text is appended to the module-level ``messages`` history,
    the whole history is sent to ``gpt-3.5-turbo``, and the model's reply is
    appended to the history and sent back to the originating chat.
    """
    # Local import: asyncio is not among this file's top-level imports.
    import asyncio

    # effective_message is used instead of update.message because the latter
    # is None when the update is an edited message.
    incoming = update.effective_message
    if incoming is None or not incoming.text:
        return

    messages.append({"role": "user", "content": incoming.text})

    # The OpenAI client call is synchronous; run it in the default executor
    # so the event loop is not blocked while waiting for the response.
    history = list(messages)
    loop = asyncio.get_running_loop()
    completion = await loop.run_in_executor(
        None,
        lambda: openai.chat.completions.create(model="gpt-3.5-turbo",
                                               messages=history),
    )

    completion_answer = completion.choices[0].message
    messages.append(completion_answer)
    await context.bot.send_message(chat_id=update.effective_chat.id,
                                   text=completion_answer.content)
async def mozilla(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle /mozilla: answer the message from the embedded documents.

    The message text is passed to ``answer_question`` together with the
    module-level ``df``; the resulting answer is sent back to the chat.
    """
    # Local import: asyncio is not among this file's top-level imports.
    import asyncio

    # effective_message is used instead of update.message because the latter
    # is None when the update is an edited message.
    incoming = update.effective_message
    if incoming is None or not incoming.text:
        return

    # answer_question performs blocking network calls; run it in the default
    # executor so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    answer = await loop.run_in_executor(
        None,
        lambda: answer_question(df, question=incoming.text, debug=True),
    )

    # answer_question returns "" when the model request fails, and Telegram
    # rejects messages with empty text, so substitute a readable fallback.
    if not answer:
        answer = "Sorry, I couldn't find an answer to that."

    await context.bot.send_message(chat_id=update.effective_chat.id, text=answer)
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle /start: reply with a fixed greeting in the originating chat."""
    greeting = "I'm a bot, please talk to me!"
    target_chat = update.effective_chat.id
    await context.bot.send_message(chat_id=target_chat, text=greeting)
if __name__ == '__main__':
    # Build the bot application and register one CommandHandler per command,
    # in the order mozilla, start, chat, then poll Telegram for updates.
    application = ApplicationBuilder().token(tg_bot_token).build()
    command_table = (
        ('mozilla', mozilla),
        ('start', start),
        ('chat', chat),
    )
    for command, callback in command_table:
        application.add_handler(CommandHandler(command, callback))
    application.run_polling()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from openai import OpenAI | |
import os | |
from typing import List | |
from scipy import spatial | |
def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings.

    Args:
        query_embedding: The vector to compare against.
        embeddings: The vectors to measure; one distance is returned per
            entry, in the same order.
        distance_metric: One of "cosine", "L1", "L2" or "Linf".

    Returns:
        A list with the distance from ``query_embedding`` to each embedding.

    Raises:
        KeyError: If ``distance_metric`` is not one of the supported names.
    """
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    # Resolve the metric once, before iterating, so an unsupported name is
    # reported even when `embeddings` is empty and the lookup is not repeated
    # for every embedding.
    try:
        metric = distance_metrics[distance_metric]
    except KeyError:
        raise KeyError(
            f"unsupported distance_metric {distance_metric!r}; "
            f"expected one of {sorted(distance_metrics)}"
        ) from None
    return [metric(query_embedding, embedding) for embedding in embeddings]
# The API key is read from the environment; a missing variable raises
# KeyError at import time.
openai = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Resolve the CSV relative to this module rather than the current working
# directory, so importing the module works no matter where the process was
# started from.
_embeddings_csv = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'processed', 'embeddings.csv')
df = pd.read_csv(_embeddings_csv, index_col=0)
# The "embeddings" column is stored as text; each cell is evaluated back into
# a Python object and then wrapped in a NumPy array.
# NOTE(review): eval() executes whatever the cell contains -- only load CSV
# files you generated yourself.
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
def create_context(question, df, max_len=1800):
    """
    Create a context for a question by finding the most similar texts in the dataframe.

    Args:
        question: The question text; it is embedded with
            'text-embedding-ada-002'.
        df: Dataframe with 'embeddings', 'n_tokens' and 'text' columns.
            It is not modified.
        max_len: Budget for the accumulated 'n_tokens' (plus 4 per text).

    Returns:
        The selected texts, closest first, joined by "\\n\\n###\\n\\n".
        An empty string if no text fits within ``max_len``.
    """
    # Get the embeddings for the question
    q_embeddings = openai.embeddings.create(
        input=question, model='text-embedding-ada-002').data[0].embedding

    # assign() returns a new frame, so the caller's dataframe does not gain
    # a 'distances' column as a side effect of asking a question.
    ranked = df.assign(
        distances=distances_from_embeddings(q_embeddings,
                                            df['embeddings'].values,
                                            distance_metric='cosine')
    ).sort_values('distances', ascending=True)

    returns = []
    cur_len = 0
    # Walk the texts from closest to farthest and stop at the first one that
    # would push the running total past max_len.
    for text, n_tokens in zip(ranked['text'], ranked['n_tokens']):
        # The extra 4 presumably accounts for the separator between texts
        # -- TODO confirm.
        cur_len += n_tokens + 4
        if cur_len > max_len:
            break
        returns.append(text)

    return "\n\n###\n\n".join(returns)
def answer_question(df,
                    model="gpt-3.5-turbo-1106",
                    question="What is the meaning of life?",
                    max_len=1800,
                    debug=False,
                    max_tokens=150,
                    stop_sequence=None):
    """
    Answer a question based on the most similar context from the dataframe texts.

    Args:
        df: Dataframe of texts and embeddings, passed on to create_context.
        model: Chat model used to produce the answer.
        question: The question to answer.
        max_len: Context budget, passed on to create_context.
        debug: If true, print the retrieved context before asking the model.
        max_tokens: Upper bound on the length of the generated answer.
        stop_sequence: Optional stop sequence(s) passed to the model.

    Returns:
        The model's answer text, or "" if the model request fails.
        Errors raised while building the context are not caught here.
    """
    # Local import: this module has no top-level logging import.
    import logging

    context = create_context(
        question,
        df,
        max_len=max_len,
    )
    # If debug, print the context that will be sent to the model
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    prompt = (
        "Answer the question based on the context below, and if the question "
        "can't be answered based on the context, say \"I don't know.\" "
        "Try to cite sources to the links in the context when possible."
        f"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:"
    )

    try:
        # Create a completion using the question and context
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
        )
        # Normalise a missing content to "" so the return value is always a
        # string, matching the failure path below.
        return response.choices[0].message.content or ""
    except Exception:
        # Best effort: callers get an empty answer instead of an exception,
        # but the full traceback is logged rather than only the message.
        logging.getLogger(__name__).exception(
            "Failed to get an answer from the model")
        return ""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It seems like reading the embeddings in questions.py into `df` isn't needed, since we always pass it to the function.