Created
July 2, 2023 23:40
-
-
Save TheDiscordian/7cfb9d5a84e3c5fe15dc5184a567e4f6 to your computer and use it in GitHub Desktop.
PLN QA AI Bot
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import json
import os
import re
import socket
import sys
import time
import urllib.request

import numpy as np
import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import distances_from_embeddings
# Usage: OPENAI_API_KEY="key-here" python3 pln_qa.py

# Price in USD per 1,000 tokens for each supported model. A bare float is a
# flat rate applied to prompt + completion tokens; a tuple is
# (prompt price, completion price).
MODELS = {"text-ada-001": 0.0004, "text-babbage-001": 0.0005, "text-curie-001": 0.002, "text-davinci-001": 0.02, "gpt-3.5-turbo": (0.0015, 0.002)}
# Model used by default when answering questions.
model = "gpt-3.5-turbo"
# CSV written when the user declines to compute embeddings.
DB = 'pln_db-noembed.csv'
# CSV holding the texts together with their embeddings.
EMBED_DB = 'pln_db.csv'

# Get OpenAI API Key from env var. An unset *or empty* variable is treated as
# missing, and the script stops with a non-zero exit status (sys.exit with a
# string prints it to stderr and exits with status 1).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    sys.exit("No OPENAI_API_KEY environment variable found, exiting...")
# Documents to download, grouped by product. Each key becomes a sub-directory
# of "dbs/" and each URL is saved there under its last path component.
# Every URL points at raw file content so that plain markdown/source text is
# fetched rather than a rendered HTML page.
EXPERTISE = {
    "ipfs": [
        "https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/what-is-ipfs.md",
        "https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/ipfs-solves.md",
        "https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/how-ipfs-works.md",
        "https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/lifecycle.md",
        "https://raw.githubusercontent.com/ipfs/ipfs-docs/main/docs/concepts/faq.md",
    ],
    "helia": [
        # General info
        "https://raw.githubusercontent.com/wiki/ipfs/helia/Manifesto.md",
        "https://raw.githubusercontent.com/wiki/ipfs/helia/FAQ.md",
        "https://raw.githubusercontent.com/wiki/ipfs/helia/Branding.md",
        "https://raw.githubusercontent.com/wiki/ipfs/helia/Meta.md",
        # Development info
        "https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/README.md",
        "https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/101-basics.js",
        "https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/201-storage.js",
        "https://raw.githubusercontent.com/ipfs-examples/helia-examples/main/examples/helia-101/301-networking.js",
        # Raw wiki form, matching the other wiki pages above (the github.com
        # wiki URL serves the rendered HTML page, not the markdown source).
        "https://raw.githubusercontent.com/wiki/ipfs/helia/Migrating-from-js-IPFS.md",
    ],
}
def _clean_markdown(text):
    """Return markdown `text` with HTML tags, links, callouts and emphasis markers removed."""
    # remove html tags
    text = re.sub(r'<[^>]*>', '', text)
    # remove markdown links, preserving the link name
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    # remove all blocks which begin with ":::callout" and end with ":::"
    text = re.sub(r':::callout[\s\S]*?:::', '', text)
    # remove all "**", as long as it closes "**" later
    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
    # remove all square bracket enclosures
    text = re.sub(r'\[([^\]]+)\]', r'\1', text)
    return text


def update_databases():
    """Download every document listed in EXPERTISE into "dbs/<product>/".

    Each URL is saved under its last path component. Files whose name ends in
    ".md" are passed through `_clean_markdown` first; other files are stored
    as downloaded. A failed download is reported and skipped so that the
    remaining documents are still fetched.
    """
    for prod, urls in EXPERTISE.items():
        print("-- Updating %s --" % prod)
        target_dir = os.path.join("dbs", prod)
        os.makedirs(target_dir, exist_ok=True)
        for url in urls:
            filename = url.split("/")[-1]
            print("Downloading %s..." % filename)
            try:
                # Context managers make sure the HTTP response and the output
                # file are closed even when decoding or writing fails.
                with urllib.request.urlopen(url) as page:
                    text = page.read().decode("utf-8")
                # only modify md files
                if filename.endswith(".md"):
                    text = _clean_markdown(text)
                # The text was decoded as UTF-8, so write it back as UTF-8
                # instead of relying on the platform default encoding.
                with open(os.path.join(target_dir, filename), "w", encoding="utf-8") as out:
                    out.write(text.replace("\n\n\n", "\n\n"))
            except Exception as e:
                # Best effort: report which document failed, back off, move on.
                print("Failed to download %s: %s" % (url, e))
                time.sleep(1)
                continue
            # brief pause between successful requests
            time.sleep(0.25)
    print()
def _split_markdown_sections(text):
    """Split markdown `text` at "## " and "### " headings.

    Returns `(title, sections)`. `title` is taken from the last "# " or
    "title: " line seen ("" when there is none). `sections` is a list of
    `(section_title, section_text)` tuples in document order; it is empty when
    the text contains no "## " / "### " heading. Each section's text starts at
    its own heading line and runs up to the next heading (or the end of the
    text), and the text before the first heading is labelled with the
    document title known at that point.
    """
    title = ""
    last_sub_title = ""
    current_title = None  # None while still before the first subheading
    sections = []
    lines = text.split("\n")
    start = 0
    for i, line in enumerate(lines):
        if line.startswith("# "):
            title = line[2:]
        if line.startswith("title: "):
            title = line[7:]
        if line.startswith("## "):
            last_sub_title = title + " - " + line[3:]
            heading = last_sub_title
        elif line.startswith("### "):
            if last_sub_title == "":
                last_sub_title = title
            heading = last_sub_title + " - " + line[4:]
        else:
            continue
        # Close the section that ends here under the heading it started with.
        sections.append((title if current_title is None else current_title,
                         "\n".join(lines[start:i])))
        current_title = heading
        start = i
    if current_title is not None:
        # Flush the final section, which no later heading would close.
        sections.append((current_title, "\n".join(lines[start:])))
    return title, sections


def db_to_aidb():
    """Build the AI database CSV from the files stored under "dbs/".

    Every file is read; ".md" files are split into sections with
    `_split_markdown_sections`, other files are kept whole. The resulting
    (title, text) rows get an `n_tokens` column computed with the
    "cl100k_base" tokenizer, and the estimated processing cost is printed.
    The user is then asked whether to continue:

    * anything other than "y": the table is written to `DB` without embeddings;
    * "y": an embedding is requested from OpenAI for every text and the table,
      including an `embeddings` column, is written to `EMBED_DB`.
    """
    ai_tech_texts = []
    # walk all the files in dbs, including in subdirectories
    for root, _dirs, files in os.walk("dbs"):
        for name in files:
            tech_file = os.path.join(root, name)
            # errors="replace" keeps one badly-encoded file from aborting the run
            with open(tech_file, "r", encoding="utf-8", errors="replace") as fh:
                text = fh.read()
            title = ""
            sections = []
            if tech_file.endswith(".md"):
                title, sections = _split_markdown_sections(text)
            if title == "":
                # extrapolate title from "<parent directory> <file stem>";
                # os.path is used so this also works with "\\" separators
                parent = os.path.basename(os.path.dirname(tech_file))
                stem = os.path.basename(tech_file).split(".")[0].replace("-", ' ')
                title = (parent + " " + stem).title()
            if not sections:
                ai_tech_texts.append((title, text))
            else:
                for section_title, section_text in sections:
                    # skip sections that are too short to be useful
                    if len(section_text.strip()) > 25:
                        ai_tech_texts.append((section_title or title, section_text))

    df = pd.DataFrame(ai_tech_texts, columns=['title', 'text'])
    tokenizer = tiktoken.get_encoding("cl100k_base")
    df['n_tokens'] = df.text.apply(lambda t: len(tokenizer.encode(t)))
    ntokens = int(df['n_tokens'].sum())
    print("Tokens used: %d ($%.2f to process)" % (ntokens, ntokens / 1000 * 0.0004))

    inp = input("Continue? (y/N) ")
    if inp.strip().lower() != "y":
        df.to_csv(DB, index=False, encoding='utf-8')
        return

    print("... Processing embeds do NOT stop this process for any reason! ...")
    total = len(df)
    done = 0

    def process_embeds(x):
        """Request the embedding for one text and report overall progress."""
        nonlocal done
        done += 1
        print("\rProcessing embeds... %.2f%%" % (done / total * 100), end="")
        return openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Embedding requests get a longer socket timeout; always restore the
    # short default afterwards, even if a request fails.
    socket.setdefaulttimeout(300)
    try:
        df['embeddings'] = df.text.apply(process_embeds)
    finally:
        socket.setdefaulttimeout(10)
    print()  # finish the "\r" progress line
    df.to_csv(EMBED_DB, index=False, encoding='utf-8')
def create_context(question, max_len=1100, max_count=6, size="ada"):
    """
    Pick the stored texts closest to `question`, nearest first.

    The question is embedded with 'text-embedding-ada-002', the tokens used by
    that request are added to the global `cheap_tokens_used`, and a cosine
    distance to every row of the global `df` is stored in `df['distances']`.
    Rows are then visited in order of increasing distance and their text is
    collected until either `max_count` rows have been taken or the running
    total of `n_tokens + 4` per row would exceed `max_len`.

    `size` is accepted but not used. Returns a list of text strings.
    """
    global df, cheap_tokens_used

    # Embed the question and account for the tokens that request consumed
    embedding_response = openai.Embedding.create(input=question, engine='text-embedding-ada-002')
    question_vector = embedding_response['data'][0]['embedding']
    cheap_tokens_used += embedding_response["usage"]["total_tokens"]

    # Distance from the question to every stored embedding
    df['distances'] = distances_from_embeddings(
        question_vector,
        df['embeddings'].values,
        distance_metric='cosine',
    )

    picked = []
    budget_used = 0
    ranked = df.sort_values('distances', ascending=True)
    for position, (_, row) in enumerate(ranked.iterrows(), start=1):
        # Stop once the row limit is reached
        if position > max_count:
            break
        # Stop once this row would push the context over the token budget
        budget_used += row['n_tokens'] + 4
        if budget_used > max_len:
            break
        picked.append(row["text"])

    return picked
def answer_question(question,
                    model=model,
                    max_len=1900,
                    max_count=3,
                    size="ada",
                    debug=False,
                    max_tokens=1100,
                    stop_sequence=("\nExpert:", "\nUser:")
                    ):
    """
    Answer a question based on the most similar context from the dataframe texts.

    Parameters:
        question: the user's question.
        model: key into `MODELS`. A model priced with a single float is called
            through `openai.Completion`, any other through
            `openai.ChatCompletion`.
        max_len, max_count, size: forwarded to `create_context`.
        debug: when true, print the context that is sent to the model.
        max_tokens: completion token limit passed to the API.
        stop_sequence: stop sequences passed to the API (a tuple is converted
            to a list; any other value, including None, is passed through).

    Returns the stripped answer text. Prompt and completion token usage is
    added to the globals `inp_tokens_used` / `out_tokens_used`. Any exception
    raised while building the context or calling the API is printed and ""
    is returned, so a failed request does not abort the caller.
    """
    global inp_tokens_used, out_tokens_used

    system_prompt = "You are a friendly expert in IPFS & Helia, use the context to answer the user's question. If it's not possible to answer the question, ask a helpful follow-up question."
    # Immutable default in the signature; hand the API a list as before.
    stop = list(stop_sequence) if isinstance(stop_sequence, tuple) else stop_sequence

    try:
        # The embedding lookup is part of the same best-effort request, so it
        # lives inside the try block together with the completion call.
        context = create_context(
            question,
            max_len=max_len,
            size=size,
            max_count=max_count,
        )

        if "ipfs" in question.lower():
            # This myth gets into the AI's head very easily, it's best to keep this line to explicitly prevent misinformation
            context.append("IPFS doesn't distribute or replicate data across the network unless another node explicitly chooses to store the data. When you add a file to your IPFS node, it's only stored on your IPFS node until another node explicity requests the data.")

        # If debug, print the context handed to the model
        if debug:
            print("Context:\n" + "\n###\n\n".join(context))
            print("\n\n")

        if isinstance(MODELS[model], float):
            # Completion-style model: everything goes into a single prompt.
            joined_context = "\n###\n\n".join(context)
            response = openai.Completion.create(
                prompt=f"{system_prompt}\n\nContext: {joined_context}\n\n---\n\nExpert: How can I help you today?\nUser: {question}\nExpert:",
                temperature=0.10,
                max_tokens=max_tokens,
                top_p=0.98,
                frequency_penalty=0,
                presence_penalty=0,
                stop=stop,
                model=model,
            )
            output = response["choices"][0]["text"]
        else:
            # Chat-style model: one system message per context entry.
            messages = [{"role": "system", "content": system_prompt}]
            for c in context:
                messages.append({"role": "system", "name": "context", "content": c})
            messages.append({"role": "user", "content": question})
            response = openai.ChatCompletion.create(
                messages=messages,
                temperature=0.10,
                max_tokens=max_tokens,
                top_p=0.98,
                frequency_penalty=0,
                presence_penalty=0,
                stop=stop,
                model=model,
            )
            output = response["choices"][0]["message"]["content"]

        inp_tokens_used += response["usage"]["prompt_tokens"]
        out_tokens_used += response["usage"]["completion_tokens"]
        return output.strip()
    except Exception as e:
        print(e)
        return ""
# Embedding table used to answer questions; loaded from EMBED_DB on first use.
df = None
openai.api_key = OPENAI_API_KEY
socket.setdefaulttimeout(10)
selection = 1
# Running token totals for the session, used for the cost report below.
inp_tokens_used = 0
out_tokens_used = 0
cheap_tokens_used = 0

# Interactive menu: repeat until the user selects 0.
while selection != 0:
    print("3. Ask a question (AI)")
    print("2. Update AI database")
    print("1. Update database")
    print("0. Exit")
    try:
        selection = int(input("Enter a selection: "))
    except ValueError:
        # not a number: show the menu again
        continue
    except (EOFError, KeyboardInterrupt):
        # stdin closed or Ctrl-C: leave the loop instead of re-prompting forever
        print()
        break
    if selection == 1:
        update_databases()
    elif selection == 2:
        db_to_aidb()
        # The CSV on disk may have changed; drop the cached table so the next
        # question reloads it.
        df = None
    elif selection == 3:
        if df is None:
            try:
                loaded = pd.read_csv(EMBED_DB, index_col=0)
            except FileNotFoundError:
                print("%s not found, run \"2. Update AI database\" first." % EMBED_DB)
                continue
            # NOTE(security): eval() executes whatever expression is stored in
            # the CSV's embeddings column. Only load CSV files produced by this
            # script; consider a literal-only parser if the file could ever
            # come from an untrusted source.
            loaded['embeddings'] = loaded['embeddings'].apply(eval).apply(np.array)
            # Publish the table only once it is fully converted.
            df = loaded
        print(answer_question(input("Hello! I'm your PLN expert, how can I help you today? "), debug=False))
        if isinstance(MODELS[model], float):
            # single flat price for prompt + completion tokens
            tokens_used = inp_tokens_used + out_tokens_used
            print("Total cost: $%.4f (%d tokens, %d embed tokens)" % (tokens_used / 1000 * MODELS[model] + cheap_tokens_used / 1000 * 0.0004, tokens_used, cheap_tokens_used))
        else:
            # separate (prompt, completion) prices
            print("Total cost: $%.4f (%d prompt tokens, %d completion tokens, %d embed tokens)" % (inp_tokens_used / 1000 * MODELS[model][0] + out_tokens_used / 1000 * MODELS[model][1] + cheap_tokens_used / 1000 * 0.0004, inp_tokens_used, out_tokens_used, cheap_tokens_used))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment