Skip to content

Instantly share code, notes, and snippets.

@lemire
Created December 6, 2023 03:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lemire/9d1aba4b0aba53926bf4f7269dfff2e7 to your computer and use it in GitHub Desktop.
Save lemire/9d1aba4b0aba53926bf4f7269dfff2e7 to your computer and use it in GitHub Desktop.
generate embeddings
###############
# You should basically never use this program. It is only for generating the embeddings for your ChatBot.
# If you want to run the ChatBot, see web_app.py
###############
import os
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas
import openai
import glob
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
# Load the OpenAI API key from the local secret file, dropping surrounding whitespace.
with open("secret.txt", "r") as key_file:
    secret = key_file.read().strip()

# Make the key available both on the openai module and through the environment.
openai.api_key = secret
os.environ["OPENAI_API_KEY"] = secret

# Chat model handle, configured with temperature 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Every entry directly under chatbot_docs/ is treated as a source document.
gfiles = glob.glob("chatbot_docs/*")
def get_embedding(text, model="text-embedding-ada-002", pause_seconds=20):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    This definition replaces the ``get_embedding`` imported from
    ``openai.embeddings_utils`` at the top of the file.

    Args:
        text: Chunk to embed. Newlines are replaced with spaces before sending.
        model: Name of the embedding model passed to the API.
        pause_seconds: Seconds to sleep after each request (presumably to
            stay under the API rate limit -- confirm against your quota).

    Returns:
        The ``embedding`` field of the first item in the response's ``data``.
    """
    text = text.replace("\n", " ")
    print("creating embedding")
    response = openai.Embedding.create(input=[text], model=model)
    embedding = response["data"][0]["embedding"]
    time.sleep(pause_seconds)
    return embedding


def _parse_embedding(cell):
    """Turn a stringified embedding such as ``"[0.1, -0.2]"`` into a list of floats."""
    return [
        float(token.strip().replace("[", "").replace("]", ""))
        for token in cell.split(",")
    ]


# The splitter does not depend on the document, so build it once.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"], chunk_size=2000, chunk_overlap=250
)

# glob returns entries in filesystem order; sorting keeps the embsN.csv
# numbering reproducible from one run to the next.
for g1, doc_path in enumerate(sorted(gfiles)):  # One CSV per document
    csv_path = f"embs{g1}.csv"
    print(f"creating {csv_path}")

    with open(doc_path, "r") as file:  # Storing the document contents
        print(f"reading {doc_path}")
        content = file.read() + "\n\n"

    texts = text_splitter.split_text(content)  # Splitting the document into chunks
    print("texts:", len(texts))

    # Strip newlines from every chunk and wrap it in a pair of double-quote
    # characters on each side. The column is built in one step instead of
    # item-by-item chained assignment (df["combined"][i] = ...), which pandas
    # warns about and which has no effect when Copy-on-Write is enabled.
    df = pandas.DataFrame(
        {"combined": ['""' + chunk.replace("\n", "") + '""' for chunk in texts]}
    )

    # Checkpoint the chunks before the slow embedding step.
    df.to_csv(csv_path, index=False)

    # One API request (followed by a pause) per chunk.
    df["embedding"] = df["combined"].apply(get_embedding)
    df.to_csv(csv_path, index=False)

    # Re-read the file (pandas uses UTF-8 by default) and convert the
    # stringified embedding column back into lists of floats before the
    # final write.
    df = pandas.read_csv(csv_path)
    df["embedding"] = [_parse_embedding(cell) for cell in df["embedding"]]
    df.to_csv(csv_path, index=False)  # Writing the final version of the csv file
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment