pdfqueryopenAIembeddings.py
#base code from https://colab.research.google.com/drive/181BSOH6KF_1o2lFG8DQ6eJd2MZyiSBNt?usp=sharing#scrollTo=xlorSbccWEDa

#install packages
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu

#import packages
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

# Get your API key from OpenAI; you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
os.environ["OPENAI_API_KEY"] = "Your_API_KEY"

# Connect your Google Drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/"

# Location of the PDF file/files.
reader = PdfReader('/content/gdrive/MyDrive/HAI_AI-Index-Report_2023.pdf')

# Read the text from every page and accumulate it in a variable called raw_text
raw_text = ''
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

# Preview the first 100 characters to confirm the extraction worked
raw_text[:100]
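# Optional sketch (not in the original gist): to index several PDFs at once, loop over a
# folder instead of one hard-coded path. The "pdfs" folder under root_dir is an assumed,
# illustrative location; the block is skipped if it does not exist.
pdf_dir = os.path.join(root_dir, "pdfs")
if os.path.isdir(pdf_dir):
    for name in sorted(os.listdir(pdf_dir)):
        if name.lower().endswith(".pdf"):
            extra_reader = PdfReader(os.path.join(pdf_dir, name))
            for page in extra_reader.pages:
                text = page.extract_text()
                if text:
                    raw_text += text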
# Split the extracted text into smaller chunks so that information retrieval does not hit the model's token limits.
# chunk_size = 1000 and chunk_overlap = 200 are measured in characters here, since length_function is len (not a tokenizer).
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)
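# Quick sanity check (added here as an illustration, not in the original gist):
# show how many chunks were produced and the length of the longest one.
print(f"{len(texts)} chunks; longest chunk is {max(len(t) for t in texts)} characters")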
# Set up the OpenAI embeddings wrapper (the embeddings themselves are computed when the vector store is built below)
embeddings = OpenAIEmbeddings()
# tiktoken is needed by OpenAIEmbeddings for token counting
!pip install tiktoken
# Embed the chunks and build a FAISS vector store for similarity search
docsearch = FAISS.from_texts(texts, embeddings)
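# Optional sketch (not in the original gist): persist the index so the chunks are not
# re-embedded on every run. save_local/load_local are part of langchain's FAISS wrapper,
# though exact signatures can vary by version; the Drive folder name below is an assumption.
index_dir = root_dir + "faiss_ai_index_report"
docsearch.save_local(index_dir)
# To reuse it later instead of rebuilding:
# docsearch = FAISS.load_local(index_dir, embeddings)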
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# "stuff" chain type: all retrieved chunks are placed into a single prompt for the LLM
chain = load_qa_chain(OpenAI(), chain_type="stuff")

query = "What are key vision models mentioned"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

query = "How is vision maturity compared to text"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

query = "What are some potential ethical issues mentioned"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)
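# Alternative sketch (not in the original gist): RetrievalQA bundles the similarity search
# and the QA chain into a single object, so each question becomes one call. The class and
# its from_chain_type constructor exist in langchain, but argument names can differ between versions.
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)
qa.run("What are some potential ethical issues mentioned")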