lemire/embedding.py

## embedding.py
###############
# You should basically never use this program. It is only for generating the embeddings for your ChatBot.
# If you want to run the ChatBot, see web_app.py
###############
import os
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas
import openai
import glob
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
# Load the OpenAI API key from a local file so it never has to appear in the source.
with open("secret.txt", "r", encoding="utf-8") as file:
    secret = file.read().strip()  # strip() drops the trailing newline editors tend to add
os.environ["OPENAI_API_KEY"] = secret  # presumably picked up by ChatOpenAI below -- TODO confirm
openai.api_key = secret  # Used by direct calls into the openai package

# Chat model handle with temperature 0; nothing in the visible part of this script uses it yet.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Every regular file in the document directory. glob returns matches in arbitrary
# order and also returns sub-directories, so sort the paths (stable embs<N>.csv
# numbering between runs) and keep only real files (open() would fail on a directory).
gfiles = sorted(path for path in glob.glob("chatbot_docs/*") if os.path.isfile(path))

def get_embedding(text, model="text-embedding-ada-002", pause_seconds=20):
    """Return the OpenAI embedding vector for ``text``.

    This intentionally replaces the ``get_embedding`` helper imported from
    ``openai.embeddings_utils`` so that every request is followed by a pause.
    It is defined once, before the loop, instead of once per document.

    Args:
        text: Chunk of text to embed; newlines are replaced by spaces first.
        model: Name of the OpenAI embedding model to query.
        pause_seconds: Seconds to sleep after each request. Defaults to the
            20 seconds this script has always waited.

    Returns:
        The ``embedding`` entry of the first (and only) item in the response.
    """
    text = text.replace("\n", " ")
    print("creating embedding")
    response = openai.Embedding.create(input=[text], model=model)
    vector = response["data"][0]["embedding"]
    time.sleep(pause_seconds)  # presumably throttles requests for the API rate limit -- TODO confirm
    return vector


# The splitter settings never change, so build the splitter once for all documents.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"], chunk_size=2000, chunk_overlap=250
)

for g1, doc_path in enumerate(gfiles):  # One embs<N>.csv per document
    csv_path = f"embs{g1}.csv"
    print(f"creating {csv_path}")

    # Read the whole document; the trailing blank line matches what the script
    # has always appended before splitting.
    print(f"reading {doc_path}")
    with open(doc_path, "r", encoding="utf-8") as doc_file:
        content = doc_file.read() + "\n\n"

    texts = text_splitter.split_text(content)  # Splitting the document content into chunks
    print("texts:", len(texts))

    # Drop newlines inside each chunk and wrap it in a pair of double quotes on
    # each side (kept from the original, which did this to avoid problems with
    # double quotes in the text). Building the list up front replaces the old
    # chained assignment df["combined"][i] = ..., which does not modify the
    # frame under pandas Copy-on-Write.
    combined = ['""' + chunk.replace("\n", "") + '""' for chunk in texts]
    df = pandas.DataFrame({"combined": combined})

    # One API call per chunk. float() guarantees plain floats in the CSV, which
    # is what the old write / re-read / re-parse round trip produced.
    df["embedding"] = df["combined"].apply(
        lambda chunk: [float(value) for value in get_embedding(chunk)]
    )

    # A single write (UTF-8 by default) yields the same file the original
    # reached after several intermediate writes and reads.
    df.to_csv(csv_path, index=False)
	###############
	# You should basically never use this program. It is only for generating the embeddings for your ChatBot.
	# If you want to run the ChatBot, see web_app.py
	###############
	import os
	from openai.embeddings_utils import get_embedding, cosine_similarity
	import pandas
	import openai
	import glob
	import time
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chat_models import ChatOpenAI
	from langchain.chains.summarize import load_summarize_chain
	# NOTE(review): this tab-indented section appears to be a second copy of the
	# script at the top of the file with its nested indentation lost; confirm
	# whether it should exist at all. The nesting below is restored with tabs.
	# Load the OpenAI API key from a local file so it never has to appear in the source.
	with open("secret.txt", "r", encoding="utf-8") as file:
		secret = file.read().strip()  # strip() drops the trailing newline editors tend to add
	os.environ["OPENAI_API_KEY"] = secret  # presumably picked up by ChatOpenAI below -- TODO confirm
	openai.api_key = secret  # Used by direct calls into the openai package

	# Chat model handle with temperature 0; nothing in the visible part of this script uses it yet.
	llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

	# Every regular file in the document directory. glob returns matches in arbitrary
	# order and also returns sub-directories, so sort the paths (stable embs<N>.csv
	# numbering between runs) and keep only real files (open() would fail on a directory).
	gfiles = sorted(path for path in glob.glob("chatbot_docs/*") if os.path.isfile(path))

	def get_embedding(text, model="text-embedding-ada-002", pause_seconds=20):
		"""Return the OpenAI embedding vector for ``text``.

		This intentionally replaces the ``get_embedding`` helper imported from
		``openai.embeddings_utils`` so that every request is followed by a pause.
		It is defined once, before the loop, instead of once per document.

		Args:
			text: Chunk of text to embed; newlines are replaced by spaces first.
			model: Name of the OpenAI embedding model to query.
			pause_seconds: Seconds to sleep after each request. Defaults to the
				20 seconds this script has always waited.

		Returns:
			The ``embedding`` entry of the first (and only) item in the response.
		"""
		text = text.replace("\n", " ")
		print("creating embedding")
		response = openai.Embedding.create(input=[text], model=model)
		vector = response["data"][0]["embedding"]
		time.sleep(pause_seconds)  # presumably throttles requests for the API rate limit -- TODO confirm
		return vector


	# The splitter settings never change, so build the splitter once for all documents.
	text_splitter = RecursiveCharacterTextSplitter(
		separators=["\n\n", "\n"], chunk_size=2000, chunk_overlap=250
	)

	for g1, doc_path in enumerate(gfiles):  # One embs<N>.csv per document
		csv_path = f"embs{g1}.csv"
		print(f"creating {csv_path}")

		# Read the whole document; the trailing blank line matches what the script
		# has always appended before splitting.
		print(f"reading {doc_path}")
		with open(doc_path, "r", encoding="utf-8") as doc_file:
			content = doc_file.read() + "\n\n"

		texts = text_splitter.split_text(content)  # Splitting the document content into chunks
		print("texts:", len(texts))

		# Drop newlines inside each chunk and wrap it in a pair of double quotes on
		# each side (kept from the original, which did this to avoid problems with
		# double quotes in the text). Building the list up front replaces the old
		# chained assignment df["combined"][i] = ..., which does not modify the
		# frame under pandas Copy-on-Write.
		combined = ['""' + chunk.replace("\n", "") + '""' for chunk in texts]
		df = pandas.DataFrame({"combined": combined})

		# One API call per chunk. float() guarantees plain floats in the CSV, which
		# is what the old write / re-read / re-parse round trip produced.
		df["embedding"] = df["combined"].apply(
			lambda chunk: [float(value) for value in get_embedding(chunk)]
		)

		# A single write (UTF-8 by default) yields the same file the original
		# reached after several intermediate writes and reads.
		df.to_csv(csv_path, index=False)