pdfqueryopenAIembeddings.py
#base code from https://colab.research.google.com/drive/181BSOH6KF_1o2lFG8DQ6eJd2MZyiSBNt?usp=sharing#scrollTo=xlorSbccWEDa

#install packages
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu

#import packages
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

# Get your API key from OpenAI; you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
os.environ["OPENAI_API_KEY"] = "Your_API_KEY"

# Connect your Google Drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/"

# Location of the PDF file/files.
reader = PdfReader('/content/gdrive/MyDrive/HAI_AI-Index-Report_2023.pdf')

# Read the text from every page and accumulate it in a variable called raw_text
raw_text = ''
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

# Preview the first 100 characters to confirm the extraction worked
raw_text[:100]
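# Optional sketch (not in the original gist): to index several PDFs at once, loop over a
# folder instead of one hard-coded path. The "pdfs" folder under root_dir is an assumed,
# illustrative location; the block is skipped if it does not exist.
pdf_dir = os.path.join(root_dir, "pdfs")
if os.path.isdir(pdf_dir):
    for name in sorted(os.listdir(pdf_dir)):
        if name.lower().endswith(".pdf"):
            extra_reader = PdfReader(os.path.join(pdf_dir, name))
            for page in extra_reader.pages:
                text = page.extract_text()
                if text:
                    raw_text += text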
# Split the extracted text into smaller chunks so that information retrieval does not hit the model's token limits.
# chunk_size = 1000 and chunk_overlap = 200 are measured in characters here, since length_function is len (not a tokenizer).
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)
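# Quick sanity check (added here as an illustration, not in the original gist):
# show how many chunks were produced and the length of the longest one.
print(f"{len(texts)} chunks; longest chunk is {max(len(t) for t in texts)} characters")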
# Set up the OpenAI embeddings wrapper (the embeddings themselves are computed when the vector store is built below)
embeddings = OpenAIEmbeddings()
# tiktoken is needed by OpenAIEmbeddings for token counting
!pip install tiktoken
# Embed the chunks and build a FAISS vector store for similarity search
docsearch = FAISS.from_texts(texts, embeddings)
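# Optional sketch (not in the original gist): persist the index so the chunks are not
# re-embedded on every run. save_local/load_local are part of langchain's FAISS wrapper,
# though exact signatures can vary by version; the Drive folder name below is an assumption.
index_dir = root_dir + "faiss_ai_index_report"
docsearch.save_local(index_dir)
# To reuse it later instead of rebuilding:
# docsearch = FAISS.load_local(index_dir, embeddings)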
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# "stuff" chain type: all retrieved chunks are placed into a single prompt for the LLM
chain = load_qa_chain(OpenAI(), chain_type="stuff")

query = "What are key vision models mentioned"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

query = "How is vision maturity compared to text"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

query = "What are some potential ethical issues mentioned"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)
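# Alternative sketch (not in the original gist): RetrievalQA bundles the similarity search
# and the QA chain into a single object, so each question becomes one call. The class and
# its from_chain_type constructor exist in langchain, but argument names can differ between versions.
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)
qa.run("What are some potential ethical issues mentioned")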