Skip to content

Instantly share code, notes, and snippets.

@jjesusfilho
Forked from janakiramm/Oscar_bot.py
Created November 24, 2023 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jjesusfilho/086d319fe526e5427132acd9cd571ab3 to your computer and use it in GitHub Desktop.
Save jjesusfilho/086d319fe526e5427132acd9cd571ab3 to your computer and use it in GitHub Desktop.
Implementing RAG with OpenAI
import openai
import tiktoken
from scipy import spatial
import pandas as pd
def text_embedding(text: str) -> list[float]:
    """Return the OpenAI embedding vector for *text*.

    The script applied ``text_embedding`` to every row without defining or
    importing it, which raised ``NameError``. The request uses the same
    model as the query embedding in ``strings_ranked_by_relatedness`` so
    that document and query vectors are comparable.
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text,
    )
    return response["data"][0]["embedding"]


# Load the nominations table and show a preview of the raw data.
df = pd.read_csv('./data/oscars.csv')
print(df.head())

# Keep only rows of the 2023 ceremony that name a film. The explicit copy
# makes the column assignments below operate on an independent frame
# instead of a possible view of the original.
df = df.loc[df['year_ceremony'] == 2023].dropna(subset=['film']).copy()
df['category'] = df['category'].str.lower()

# Build one sentence per nomination. Every row first gets the "winner"
# wording; rows whose `winner` flag equals False are then overwritten.
nomination = (
    df['name']
    + ' got nominated under the category, '
    + df['category']
    + ', for the film '
    + df['film']
)
df['text'] = nomination + ' to win the award'
# `== False` is intentional: it is an element-wise comparison on the column.
df.loc[df['winner'] == False, 'text'] = nomination + ' but did not win'  # noqa: E712
print(df.head()['text'])

# Attach an embedding vector to every sentence (one API request per row).
df = df.assign(embedding=df["text"].apply(text_embedding))
print(df.head())
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Rank the texts in *df* by their relatedness to *query*.

    Args:
        query: Text to embed and compare against every row.
        df: Frame with a ``text`` column and an ``embedding`` column.
        relatedness_fn: Callable taking the query embedding and a row
            embedding and returning a score; higher means more related.
            Defaults to ``1 - cosine distance``.
        top_n: Maximum number of results to return.

    Returns:
        Two parallel lists, the texts and their scores, sorted from most
        to least related and truncated to ``top_n`` entries. Both lists
        are empty when *df* has no rows (previously this case raised
        ``ValueError`` while unpacking an empty ``zip``).
    """
    EMBEDDING_MODEL = "text-embedding-ada-002"
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Iterate the two needed columns directly; building a Series per row
    # with iterrows() is unnecessary here.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Slice before splitting so the empty case needs no special handling,
    # and return real lists as the annotation promises.
    best = scored[:top_n]
    return [text for text, _ in best], [score for _, score in best]
# Demo: show the three nomination sentences most related to "Lady Gaga".
strings, relatednesses = strings_ranked_by_relatedness("Lady Gaga", df, top_n=3)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    # `display` is only available inside IPython/Jupyter; in a plain
    # script it raised NameError, so the text is printed instead.
    print(string)
def num_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens *text* encodes to for *model*.

    Args:
        text: The string to tokenize.
        model: Model name used to select the tokenizer. Defaults to
            ``"gpt-3.5-turbo"``, the value that was previously hard-coded,
            so existing single-argument calls behave as before.

    Returns:
        The length of the encoded token sequence.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to a
        # tokenizer; fall back to cl100k_base (the gpt-3.5-turbo encoding)
        # rather than failing the whole request.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Assemble the prompt sent to the chat model for *query*.

    The prompt is a fixed instruction, followed by as many of the most
    related texts from *df* as fit, followed by the question itself.

    Args:
        query: The user's question.
        df: Frame passed through to ``strings_ranked_by_relatedness``.
        model: Accepted for the caller's convenience; not used in this
            function's body.
        token_budget: Upper bound, as measured by ``num_tokens``, on the
            size of the complete prompt.

    Returns:
        The assembled prompt string.
    """
    ranked_texts, _ = strings_ranked_by_relatedness(query, df)
    header = 'Use the below content related to 95th Oscar awards to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    footer = f"\n\nQuestion: {query}"

    prompt = header
    for text in ranked_texts:
        section = f'\n\nOscar database section:\n"""\n{text}\n"""'
        # Stop at the first section that would push the complete prompt
        # (header + sections so far + this one + question) over budget.
        if num_tokens(prompt + section + footer) > token_budget:
            break
        prompt += section
    return prompt + footer
def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = "gpt-3.5-turbo",
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer *query* with the chat model, grounded in rows from *df*.

    Args:
        query: The user's question.
        df: Frame handed to ``query_message``; defaults to the
            module-level ``df`` as bound when this function is defined.
        model: Chat model name passed to the completion request.
        token_budget: Token limit forwarded to ``query_message``.
        print_message: When true, print the assembled prompt before
            sending it.

    Returns:
        The content of the first choice in the completion response.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    system_turn = {"role": "system", "content": "You answer questions about 95th Oscar awards."}
    user_turn = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0
    )
    return completion["choices"][0]["message"]["content"]
# Example questions: print the model's answer to each, in order.
for question in (
    'What was the nomination from Lady Gaga for the 95th Oscars?',
    'What were the nominations for the music awards?',
):
    print(ask(question))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment