Skip to content

Instantly share code, notes, and snippets.

@jryebread
Created February 8, 2023 06:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jryebread/3e9e66e0f131082f8a8b5cf4d8531573 to your computer and use it in GitHub Desktop.
import pickle
from dotenv import load_dotenv
import os
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
load_dotenv()
import streamlit as st
import PyPDF2
from pathlib import Path
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.llms import OpenAI
def remove_extra(line):
    """Normalize whitespace in a line of extracted PDF text.

    Replaces real newlines and literal backslash-n sequences with spaces,
    then collapses every run of consecutive spaces down to a single space.

    Args:
        line: raw text of one PDF page.

    Returns:
        The cleaned string (strings are immutable; the input is unchanged).
    """
    line = line.replace('\n', ' ')
    line = line.replace('\\n', ' ')
    # Bug fix: the original ran exactly two '  ' -> ' ' passes, which
    # leaves runs of 5+ spaces only partially collapsed. Loop instead.
    while '  ' in line:
        line = line.replace('  ', ' ')
    return line
def pdfToTxt(max_pages=545):
    """Transcribe 'file.pdf' into cleaned text written to 'file.txt'.

    Reads at most ``max_pages`` pages (the original document was known
    to contain junk past page 544), cleans each page's text with
    ``remove_extra``, and writes the result to file.txt.

    Args:
        max_pages: number of pages to transcribe before stopping.
            Defaults to 545, matching the original hard-coded cutoff
            (pages 0..544 inclusive).
    """
    pages_text = []
    print("Starting pdf to text transcription")
    with open("file.pdf", 'rb') as pdfFileObject:
        pdfReader = PyPDF2.PdfReader(pdfFileObject)
        print(" No. Of Pages :", len(pdfReader.pages))
        for i, page in enumerate(pdfReader.pages):
            # Use the page yielded by enumerate directly (the original
            # ignored it and re-indexed pdfReader.pages[i]).
            # extract_text() can return None for image-only pages.
            pages_text.append(page.extract_text() or "")
            if i >= max_pages - 1:
                break
    # Write the cleaned text to file
    with open('file.txt', 'w', encoding="utf-8") as f:
        for line in pages_text:
            # Bug fix: strings are immutable — the original called
            # remove_extra(line) and discarded the cleaned result,
            # writing the raw text instead.
            f.write(remove_extra(line))
    print("done with pdf to txt!")
# --- Streamlit app: upload a PDF, index it with FAISS, answer questions ---
uploaded_file = st.file_uploader("Choose a pdf file")
if uploaded_file is not None:
    # Persist the upload to disk so pdfToTxt() can read it.
    with open('file.pdf', 'wb') as handler:
        handler.write(uploaded_file.getvalue())
    with st.spinner(text='In progress'):
        pdfToTxt()
    st.success("pdf uploaded!")

    with open('file.txt', encoding="utf-8") as f:
        data = f.read()
    print("loaded data")

    # Split the text into chunks small enough to embed and to fit in the
    # LLM context window.
    text_splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
    source_chunks = [Document(page_content=chunk) for chunk in text_splitter.split_text(data)]

    # Bug fix: OPENAI_KEY was referenced but never defined anywhere in the
    # file (NameError at runtime). Read it from the environment populated
    # by load_dotenv(). NOTE(review): assumes the .env entry is named
    # "OPENAI_KEY" — confirm against the .env file.
    OPENAI_KEY = os.getenv("OPENAI_KEY")

    # Embed each chunk (OpenAI embeddings = feature vectors) and build a
    # FAISS similarity-search index over them.
    # https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)
    vectorStore = FAISS.from_documents(source_chunks, embeddings)
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

    chain = load_qa_with_sources_chain(OpenAI(temperature=0, openai_api_key=OPENAI_KEY),
                                       chain_type="stuff")
    userText = st.text_input('Ask Me Anything, Im an AI that just scanned that pdf :)')
    # Robustness: don't hit the embeddings/LLM APIs until the user has
    # actually typed a question (the text input starts out empty).
    if userText:
        result = chain(
            {
                "input_documents": vectorStore.similarity_search(userText, k=4),
                "question": userText,
            },
            return_only_outputs=True,
        )["output_text"]
        st.balloons()
        st.subheader(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment