Skip to content

Instantly share code, notes, and snippets.

@siva2k16
Created April 11, 2023 04:01
#base code from https://colab.research.google.com/drive/181BSOH6KF_1o2lFG8DQ6eJd2MZyiSBNt?usp=sharing#scrollTo=xlorSbccWEDa
#install packages
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
#import packages
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
# An OpenAI API key is required; you will need to create an account to get one.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
from getpass import getpass

# Keep a key that is already present in the environment. Otherwise prompt for
# it without echoing, so the secret never has to be written into this file.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
# Connect your Google Drive so the PDF stored there becomes readable.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
# Single root for everything read from Drive; the PDF path below is built from
# it so the two can no longer disagree ("My Drive" vs "MyDrive").
root_dir = "/content/gdrive/MyDrive/"
# Location of the pdf file (resolves to the same path string as before).
pdf_path = os.path.join(root_dir, "HAI_AI-Index-Report_2023.pdf")
reader = PdfReader(pdf_path)
# Read the text of every page and concatenate it into raw_text.
# Pages for which extract_text() returns nothing (empty/None) are skipped.
page_texts = []
for page in reader.pages:
    text = page.extract_text()
    if text:
        page_texts.append(text)
# Join once at the end instead of growing the string inside the loop.
raw_text = ''.join(page_texts)
# Preview of the first 100 characters (a bare expression: only shown when it is
# the last statement of a notebook cell).
raw_text[:100]
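# We need to split the text that we read into smaller chunks so that during information retrieval we don't hit the token size limits.
# chunk_size = 1000: maximum chunk length in characters (length is measured with len, not in tokens)
# chunk_overlap = 200: number of characters shared between consecutive chunks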
# Configure a newline-based splitter whose chunk lengths are measured with len(),
# then cut the extracted PDF text into overlapping pieces.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
# Create the OpenAI embeddings object.
# NOTE(review): presumably it picks up OPENAI_API_KEY from the environment — TODO confirm.
embeddings = OpenAIEmbeddings()
# NOTE(review): tiktoken is installed here, right before the first call that uses
# the embeddings object — presumably a runtime requirement of it; verify.
!pip install tiktoken
# Build a FAISS vector store from the text chunks using the embeddings object.
docsearch = FAISS.from_texts(texts, embeddings)
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
# Question-answering chain of type "stuff", backed by the OpenAI LLM wrapper.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

# Questions to ask about the indexed PDF.
questions = [
    "What are key vision models mentioned",
    "How is vision maturity compared to text",
    "What are some potential ethical issues mentioned",
]

# For each question: fetch the most similar chunks from the vector store, run
# the chain over them, and print the answer. Printing is required because a
# bare chain.run(...) expression discards its result outside the last line of
# a notebook cell.
for query in questions:
    docs = docsearch.similarity_search(query)
    answer = chain.run(input_documents=docs, question=query)
    print(f"Q: {query}\nA: {answer}\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment