PLN QA AI Bot
import urllib.request, time, json, datetime, socket, re, tiktoken, openai, os
import numpy as np
import pandas as pd
from openai.embeddings_utils import distances_from_embeddings
# Usage: OPENAI_API_KEY="key-here" python3 pln_qa.py
# Prices are USD per 1K tokens; chat models use a (prompt, completion) tuple
MODELS = {
    "text-ada-001": 0.0004,
    "text-babbage-001": 0.0005,
    "text-curie-001": 0.002,
    "text-davinci-001": 0.02,
    "gpt-3.5-turbo": (0.0015, 0.002),
}
model = "gpt-3.5-turbo"
DB = 'pln_db-noembed.csv'
EMBED_DB = 'pln_db.csv'
OPENAI_API_KEY = ""
# Get OpenAI API Key from env var
if 'OPENAI_API_KEY' in os.environ:
    OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
else:
    print("No OPENAI_API_KEY environment variable found, exiting...")
    exit()
EXPERTISE = {"ipfs": [
"https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/what-is-ipfs.md",
"https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/ipfs-solves.md",
"https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/how-ipfs-works.md",
"https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/lifecycle.md",
"https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/faq.md",
],
"helia": [
# General info
"https://raw.githubusercontent.com/wiki/ipfs/helia/Manifesto.md",
"https://raw.githubusercontent.com/wiki/ipfs/helia/FAQ.md",
"https://raw.githubusercontent.com/wiki/ipfs/helia/Branding.md",
"https://raw.githubusercontent.com/wiki/ipfs/helia/Meta.md",
# Development info
"https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/README.md",
"https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/101-basics.js",
"https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/201-storage.js",
"https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/301-networking.js",
"https://github.com/ipfs/helia/wiki/Migrating-from-js-IPFS.md",
]}
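# To teach the bot about another product, add a key with a list of raw-text URLs,
# e.g. (hypothetical): EXPERTISE["kubo"] = ["https://raw.githubusercontent.com/ipfs/kubo/master/README.md"]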
def update_databases():
    for prod in EXPERTISE:
        print("-- Updating %s --" % prod)
        # create dbs/<prod> if it doesn't exist yet
        if not os.path.exists("dbs/%s" % prod):
            os.makedirs("dbs/%s" % prod)
        # download all the documents listed in EXPERTISE[prod] and save them in dbs/<prod>
        for url in EXPERTISE[prod]:
            filename = url.split("/")[-1]
            print("Downloading %s..." % filename)
            try:
                page = urllib.request.urlopen(url)
                text = page.read().decode("utf-8")
                # only clean up md files
                if filename.endswith(".md"):
                    # remove html tags
                    text = re.sub(r'<[^>]*>', '', text)
                    # remove markdown links, preserving the link name
                    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
                    # remove all blocks which begin with ":::callout" and end with ":::"
                    text = re.sub(r':::callout[\s\S]*?:::', '', text)
                    # strip bold markers, keeping the text between paired "**"
                    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
                    # remove all square bracket enclosures
                    text = re.sub(r'\[([^\]]+)\]', r'\1', text)
                # collapse triple newlines before saving (TODO: does the replace here really help?)
                open("dbs/%s/%s" % (prod, filename), "w").write(text.replace("\n\n\n", "\n\n"))
            except Exception as e:
                print(e)
                time.sleep(1)
                continue
            time.sleep(0.25)
    # reload_database()
    print()
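# After a run of update_databases(), dbs/ should look roughly like:
#   dbs/ipfs/what-is-ipfs.md, dbs/ipfs/faq.md, ...
#   dbs/helia/Manifesto.md, dbs/helia/101-basics.js, ...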
def db_to_aidb():
    ai_tech_texts = []
    ai_tech_files = []
    # list all the files in dbs, including in subdirectories, storing the list in ai_tech_files
    for root, dirs, files in os.walk("dbs"):
        for file in files:
            ai_tech_files.append(os.path.join(root, file))
    for tech_file in ai_tech_files:
        # read the file
        text = open(tech_file, "r").read()
        title = ""
        last_sub_title = ""
        texts = []
        if tech_file.endswith(".md"):
            # find out if one of the first lines is a title
            textlines = text.split("\n")
            last_split = 0
            for l in range(len(textlines)):
                if textlines[l].startswith("# "):
                    title = textlines[l][2:]
                if textlines[l].startswith("title: "):
                    title = textlines[l][7:]
                # locate lines with subheadings, and break them up into more texts, stored in "texts"
                if textlines[l].startswith("## "):
                    last_sub_title = title + " - " + textlines[l][3:]
                    texts.append((last_sub_title, '\n'.join(textlines[last_split:l])))
                    last_split = l
                if textlines[l].startswith("### "):
                    if last_sub_title == "":
                        last_sub_title = title
                    texts.append((last_sub_title + " - " + textlines[l][4:], '\n'.join(textlines[last_split:l])))
                    last_split = l
        if title == "":
            # extrapolate title from filename
            split_file = tech_file.split("/")
            title = (split_file[-2] + " " + split_file[-1].split(".")[0].replace("-", ' ')).title()
        if len(texts) == 0:
            # title, text
            ai_tech_texts.append((title, text))  # FIXME just storing text isn't good enough, break it up by sections defined in the md doc
        else:
            for t in texts:
                if len(t[1].strip()) > 25:
                    ai_tech_texts.append((t[0], t[1]))
    df = pd.DataFrame(ai_tech_texts, columns=['title', 'text'])
    df.head()
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # df = pd.read_csv('processed/scraped.csv', index_col=0)
    df.columns = ['title', 'text']
    global ntokens
    ntokens = 0
    def get_token_count(x):
        global ntokens
        tokens = tokenizer.encode(x)
        ntokens += len(tokens)
        #if len(tokens) > 900:
        #    print("High token count...%d" % (len(tokens)))
        return len(tokens)
    df['n_tokens'] = df.text.apply(get_token_count)
    print("Tokens used: %d ($%.2f to process)" % (ntokens, ntokens / 1000 * 0.0004))
    inp = input("Continue? (y/N) ")
    if inp.lower() != "y":
        # user declined; save the database without embeddings
        df.to_csv(DB, index=False, encoding='utf-8')
        return
    print("... Processing embeds, do NOT stop this process for any reason! ...")
    socket.setdefaulttimeout(300)
    global count
    count = 0
    def process_embeds(x):
        global count
        count += 1
        print("\rProcessing embeds... %.2f%%" % (count / len(df) * 100), end="")
        return openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']
    df['embeddings'] = df.text.apply(process_embeds)
    socket.setdefaulttimeout(10)
    df.to_csv(EMBED_DB, index=False, encoding='utf-8')
    df.head()
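# Note: to_csv() serialises the embedding lists as strings, which is why the menu
# loop below re-hydrates them on load with .apply(eval).apply(np.array).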
def create_context(question, max_len=1100, max_count=6, size="ada"):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """
    global df, cheap_tokens_used
    # Get the embeddings for the question
    emb = openai.Embedding.create(input=question, engine='text-embedding-ada-002')
    q_embeddings = emb['data'][0]['embedding']
    cheap_tokens_used += emb["usage"]["total_tokens"]
    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
    returns = []
    cur_len = 0
    count = 0
    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        count += 1
        if count > max_count:
            break
        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4
        # If the context is too long, break
        if cur_len > max_len:
            break
        # Else add it to the text that is being returned
        #returns.append("Name: %s\nDescription: %s" % (i, row["text"]))
        returns.append(row["text"])
    # Return the context
    return returns
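# distances_from_embeddings above comes from openai.embeddings_utils (openai<1.0).
# If that helper isn't available, the cosine-distance step could be sketched with
# plain numpy like this (a sketch under that assumption, not a drop-in replacement):
def cosine_distances(q_embedding, embeddings):
    q = np.array(q_embedding)
    # cosine distance = 1 - cosine similarity
    return [1 - np.dot(q, e) / (np.linalg.norm(q) * np.linalg.norm(e)) for e in embeddings]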
def answer_question(question,
                    model=model,
                    max_len=1900,
                    max_count=3,
                    size="ada",
                    debug=False,
                    max_tokens=1100,
                    stop_sequence=["\nExpert:", "\nUser:"]
                    ):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        max_len=max_len,
        size=size,
        max_count=max_count,
    )
    try:
        # Create a completion using the question and context
        response = None
        output = ""
        q_lower = question.lower()
        if "ipfs" in q_lower:
            # This myth gets into the AI's head very easily; it's best to keep this line to explicitly prevent misinformation
            context.append("IPFS doesn't distribute or replicate data across the network unless another node explicitly chooses to store the data. When you add a file to your IPFS node, it's only stored on your IPFS node until another node explicitly requests the data.")
        # If debug, print the context passed to the model
        if debug:
            print("Context:\n" + "\n###\n\n".join(context))
            print("\n\n")
        if type(MODELS[model]) == float:
            # Legacy completion models take a single prompt string
            context = "\n###\n\n".join(context)
            response = openai.Completion.create(
                prompt=f"You are a friendly expert in IPFS & Helia, use the context to answer the user's question. If it's not possible to answer the question, ask a helpful follow-up question.\n\nContext: {context}\n\n---\n\nExpert: How can I help you today?\nUser: {question}\nExpert:",
                temperature=0.10,
                max_tokens=max_tokens,
                top_p=0.98,
                frequency_penalty=0,
                presence_penalty=0,
                stop=stop_sequence,
                model=model,
            )
            output = response["choices"][0]["text"]
        else:
            # Chat models take the context as a list of system messages instead
            messages = [{"role": "system", "content": "You are a friendly expert in IPFS & Helia, use the context to answer the user's question. If it's not possible to answer the question, ask a helpful follow-up question."}]
            for c in context:
                messages.append({"role": "system", "name": "context", "content": c})
            messages.append({"role": "user", "content": question})
            response = openai.ChatCompletion.create(
                messages=messages,
                temperature=0.10,
                max_tokens=max_tokens,
                top_p=0.98,
                frequency_penalty=0,
                presence_penalty=0,
                stop=stop_sequence,
                model=model,
            )
            output = response["choices"][0]["message"]["content"]
        global inp_tokens_used, out_tokens_used
        #print(response)
        inp_tokens_used += response["usage"]["prompt_tokens"]
        out_tokens_used += response["usage"]["completion_tokens"]
        return output.strip()
    except Exception as e:
        print(e)
        return ""
df = None
openai.api_key = OPENAI_API_KEY
socket.setdefaulttimeout(10)
selection = 1
inp_tokens_used = 0
out_tokens_used = 0
cheap_tokens_used = 0
while selection != 0:
    print("3. Ask a question (AI)")
    print("2. Update AI database")
    print("1. Update database")
    print("0. Exit")
    try:
        selection = int(input("Enter a selection: "))
    except ValueError:
        continue
    if selection == 1:
        update_databases()
    elif selection == 2:
        db_to_aidb()
    elif selection == 3:
        if df is None:
            df = pd.read_csv(EMBED_DB, index_col=0)
            df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
            df.head()
        print(answer_question(input("Hello! I'm your PLN expert, how can I help you today? "), debug=False))
        if type(MODELS[model]) == float:
            tokens_used = inp_tokens_used + out_tokens_used
            print("Total cost: $%.4f (%d tokens, %d embed tokens)" % (tokens_used / 1000 * MODELS[model] + cheap_tokens_used / 1000 * 0.0004, tokens_used, cheap_tokens_used))
        else:
            print("Total cost: $%.4f (%d prompt tokens, %d completion tokens, %d embed tokens)" % (inp_tokens_used / 1000 * MODELS[model][0] + out_tokens_used / 1000 * MODELS[model][1] + cheap_tokens_used / 1000 * 0.0004, inp_tokens_used, out_tokens_used, cheap_tokens_used))