@nheingit
Last active August 6, 2024 02:34
questions.py and main.py for day 2: a Telegram bot with a /chat command for free-form GPT chat and a /mozilla command that answers questions over precomputed embeddings.
main.py

import os
import logging

from telegram import Update
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler
from openai import OpenAI
import pandas as pd
import numpy as np

from questions import answer_question

openai = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
tg_bot_token = os.environ['TG_BOT_TOKEN']

# Get the directory of the current script
current_dir = os.path.dirname(os.path.abspath(__file__))
# Construct the absolute path to the CSV file
csv_path = os.path.join(current_dir, "processed", "embeddings.csv")

# Load the precomputed embeddings and parse each row's vector back into a numpy array
df = pd.read_csv(csv_path, index_col=0)
df["embeddings"] = df["embeddings"].apply(eval).apply(np.array)

# Shared conversation history for the /chat command
messages = [{
    "role": "system",
    "content": "You are a helpful assistant that answers questions."
}]

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO)


# /chat: free-form conversation that keeps appending to the running history
async def chat(update: Update, context: ContextTypes.DEFAULT_TYPE):
    messages.append({"role": "user", "content": update.message.text})
    completion = openai.chat.completions.create(model="gpt-3.5-turbo",
                                                messages=messages)
    completion_answer = completion.choices[0].message
    messages.append(completion_answer)
    await context.bot.send_message(chat_id=update.effective_chat.id,
                                   text=completion_answer.content)


# /mozilla: answer a question via the embeddings-backed retrieval in questions.py
async def mozilla(update: Update, context: ContextTypes.DEFAULT_TYPE):
    answer = answer_question(df, question=update.message.text, debug=True)
    await context.bot.send_message(chat_id=update.effective_chat.id, text=answer)


# /start: greeting
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await context.bot.send_message(chat_id=update.effective_chat.id,
                                   text="I'm a bot, please talk to me!")


if __name__ == '__main__':
    application = ApplicationBuilder().token(tg_bot_token).build()

    start_handler = CommandHandler('start', start)
    chat_handler = CommandHandler('chat', chat)
    mozilla_handler = CommandHandler('mozilla', mozilla)

    application.add_handler(mozilla_handler)
    application.add_handler(start_handler)
    application.add_handler(chat_handler)

    application.run_polling()
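
Both files expect a processed/embeddings.csv produced ahead of time, with text, n_tokens, and embeddings columns (the column names are inferred from the code in questions.py below). Here is a minimal sketch of how such a file could be built; the chunk list, the tiktoken encoding, and the script name are all assumptions, not part of the gist:

# build_embeddings.py (hypothetical; the gist assumes this CSV already exists)
import pandas as pd
import tiktoken
from openai import OpenAI

openai = OpenAI()  # reads OPENAI_API_KEY from the environment
enc = tiktoken.get_encoding("cl100k_base")

chunks = ["first document chunk...", "second document chunk..."]  # placeholder text

df = pd.DataFrame({"text": chunks})
df["n_tokens"] = df["text"].apply(lambda t: len(enc.encode(t)))
df["embeddings"] = df["text"].apply(
    lambda t: openai.embeddings.create(
        input=t, model="text-embedding-ada-002").data[0].embedding)
df.to_csv("processed/embeddings.csv")

Writing the embedding lists through to_csv stores them as strings, which is why both files reparse the column with apply(eval).apply(np.array) on load.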
questions.py

import numpy as np
import pandas as pd
from openai import OpenAI
import os
from typing import List
from scipy import spatial


def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings."""
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances
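
# Sanity check (hypothetical values): cosine distance is 0.0 for identical
# directions and 1.0 for orthogonal ones, so
#     distances_from_embeddings([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]])
# returns [0.0, 1.0].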

openai = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

def create_context(question, df, max_len=1800):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.embeddings.create(
        input=question, model='text-embedding-ada-002').data[0].embedding

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings,
                                                df['embeddings'].values,
                                                distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)
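
# Shape of the value create_context() returns (chunks are hypothetical): the
# nearest rows' text joined by the separator, e.g.
#     "First matching chunk...\n\n###\n\nNext matching chunk..."
# The +4 per row above is presumably headroom for that separator's tokens.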

def answer_question(df,
                    model="gpt-3.5-turbo-1106",
                    question="What is the meaning of life?",
                    max_len=1800,
                    debug=False,
                    max_tokens=150,
                    stop_sequence=None):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
    )
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a chat completion using the question and context
        response = openai.chat.completions.create(
            model=model,
            messages=[{
                "role": "user",
                "content": f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know.\" Try to cite sources to the links in the context when possible.\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            }],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(e)
        return ""
dannyhw commented May 5, 2024

it seems like reading the embeddings in questions.py into df isn't needed since we always pass it to the function
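
One way to act on that observation (a sketch, not code from the gist; load_embeddings is a hypothetical name): drop the import-time read in questions.py and expose a loader instead, since answer_question() already takes df as a parameter:

# questions.py without the import-time side effect (sketch)
import numpy as np
import pandas as pd

def load_embeddings(path: str = "processed/embeddings.csv") -> pd.DataFrame:
    """Read the precomputed embeddings CSV and parse the vectors."""
    df = pd.read_csv(path, index_col=0)
    df["embeddings"] = df["embeddings"].apply(eval).apply(np.array)
    return df

# main.py would then call df = load_embeddings(csv_path) at startup and keep
# passing df into answer_question() as it already does.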
