Last active
December 18, 2023 19:51
-
-
Save iankelk/7777a5e53f62fd688dc825aa545ba3df to your computer and use it in GitHub Desktop.
RAG with Clarifai and langchain
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Data Organization: chunk documents
@st.cache_resource(ttl="1h")
def load_chunk_pdf(uploaded_files):
    """Read uploaded PDF files and split them into text chunks.

    Args:
        uploaded_files: Streamlit UploadedFile objects (PDFs).

    Returns:
        List of LangChain Document chunks (~1000 characters each,
        no overlap), ready for vector-store ingestion.
    """
    documents = []
    # Use a context manager so the temporary directory (and the PDF
    # copies written into it) is removed as soon as loading finishes,
    # instead of lingering until garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files:
            temp_filepath = os.path.join(temp_dir, file.name)
            # PyPDFLoader needs a filesystem path, so persist the
            # in-memory upload to disk first.
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            loader = PyPDFLoader(temp_filepath)
            documents.extend(loader.load())
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(documents)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create vector store on Clarifai for use in step 2
def vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT):
    """Index chunked documents in a Clarifai vector store.

    Returns a Clarifai vector DB configured to retrieve the top 3
    matching documents per query.
    """
    return Clarifai.from_documents(
        user_id=USER_ID,
        app_id=APP_ID,
        documents=docs,
        pat=CLARIFAI_PAT,
        number_of_docs=3,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def QandA(CLARIFAI_PAT, clarifai_vector_db):
    """Build a RetrievalQA chain backed by a Clarifai-hosted GPT-4 model.

    Args:
        CLARIFAI_PAT: Clarifai personal access token.
        clarifai_vector_db: vector store produced by vectorstore().

    Returns:
        A RetrievalQA chain using the "stuff" strategy (all retrieved
        chunks are prepended to the prompt in a single LLM call).
    """
    from langchain.llms import Clarifai

    # The LLM is served from Clarifai's public "openai/chat-completion" app.
    clarifai_llm = Clarifai(
        pat=CLARIFAI_PAT,
        user_id="openai",
        app_id="chat-completion",
        model_id="GPT-4",
    )
    return RetrievalQA.from_chain_type(
        llm=clarifai_llm,
        chain_type="stuff",
        retriever=clarifai_vector_db.as_retriever(),
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def main():
    """Streamlit entry point: collect Clarifai credentials and PDFs,
    then answer a user question with RAG over the uploaded documents.
    """
    # The chain built by QandA() uses GPT-4, so the prompt says GPT-4
    # (the original text incorrectly said "GPT 3.5 Turbo").
    user_question = st.text_input(
        "Ask a question to the GPT-4 model about your documents "
        "and click on get the response")

    with st.sidebar:
        st.subheader("Add your Clarifai PAT, USER ID, APP ID along with the documents")
        # Credentials for the user's own Clarifai app (where the vector
        # store is created), plus the PDF files to index.
        CLARIFAI_PAT = st.text_input("Clarifai PAT", type="password")
        USER_ID = st.text_input("Clarifai user id")
        APP_ID = st.text_input("Clarifai app id")
        uploaded_files = st.file_uploader(
            "Upload your PDFs here", accept_multiple_files=True)

    if not (CLARIFAI_PAT and USER_ID and APP_ID and uploaded_files):
        st.info("Please add your Clarifai PAT, USER_ID, APP_ID and upload files to continue.")
    elif st.button("Get the response"):
        with st.spinner("Processing"):
            # 1. Chunk the uploaded PDFs.
            docs = load_chunk_pdf(uploaded_files)
            # 2. Index the chunks on Clarifai.
            clarifai_vector_db = vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT)
            # 3. Build the Q&A chain; retrieval and prompt assembly
            #    happen inside RetrievalQA when .run() is called.
            conversation = QandA(CLARIFAI_PAT, clarifai_vector_db)
            response = conversation.run(user_question)
            st.write(response)

if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import streamlit as st
import tempfile
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Clarifai
from langchain.chains import RetrievalQA
from clarifai.modules.css import ClarifaiStreamlitCSS
# Page-level Streamlit setup: browser-tab title/icon, page heading,
# and Clarifai's default CSS tweaks for Streamlit apps.
st.set_page_config(page_title="Chat with Documents", page_icon="🦜")
st.title("🦜 RAG with Clarifai and Langchain")
ClarifaiStreamlitCSS.insert_default_css(st)
# 1. Data Organization: chunk documents
@st.cache_resource(ttl="1h")
def load_chunk_pdf(uploaded_files):
    """Read uploaded PDF files and split them into text chunks.

    Args:
        uploaded_files: Streamlit UploadedFile objects (PDFs).

    Returns:
        List of LangChain Document chunks (~1000 characters each,
        no overlap), ready for vector-store ingestion.
    """
    documents = []
    # Use a context manager so the temporary directory (and the PDF
    # copies written into it) is removed as soon as loading finishes,
    # instead of lingering until garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files:
            temp_filepath = os.path.join(temp_dir, file.name)
            # PyPDFLoader needs a filesystem path, so persist the
            # in-memory upload to disk first.
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            loader = PyPDFLoader(temp_filepath)
            documents.extend(loader.load())
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(documents)
# Create vector store on Clarifai for use in step 2
def vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT):
    """Index chunked documents in a Clarifai vector store.

    Returns a Clarifai vector DB configured to retrieve the top 3
    matching documents per query.
    """
    return Clarifai.from_documents(
        user_id=USER_ID,
        app_id=APP_ID,
        documents=docs,
        pat=CLARIFAI_PAT,
        number_of_docs=3,
    )
def QandA(CLARIFAI_PAT, clarifai_vector_db):
    """Build a RetrievalQA chain backed by a Clarifai-hosted GPT-4 model.

    Args:
        CLARIFAI_PAT: Clarifai personal access token.
        clarifai_vector_db: vector store produced by vectorstore().

    Returns:
        A RetrievalQA chain using the "stuff" strategy (all retrieved
        chunks are prepended to the prompt in a single LLM call).
    """
    from langchain.llms import Clarifai

    # The LLM is served from Clarifai's public "openai/chat-completion" app.
    clarifai_llm = Clarifai(
        pat=CLARIFAI_PAT,
        user_id="openai",
        app_id="chat-completion",
        model_id="GPT-4",
    )
    return RetrievalQA.from_chain_type(
        llm=clarifai_llm,
        chain_type="stuff",
        retriever=clarifai_vector_db.as_retriever(),
    )
def main():
    """Streamlit entry point: collect Clarifai credentials and PDFs,
    then answer a user question with RAG over the uploaded documents.
    """
    # The chain built by QandA() uses GPT-4, so the prompt says GPT-4
    # (the original text incorrectly said "GPT 3.5 Turbo").
    user_question = st.text_input(
        "Ask a question to the GPT-4 model about your documents "
        "and click on get the response")

    with st.sidebar:
        st.subheader("Add your Clarifai PAT, USER ID, APP ID along with the documents")
        # Credentials for the user's own Clarifai app (where the vector
        # store is created), plus the PDF files to index.
        CLARIFAI_PAT = st.text_input("Clarifai PAT", type="password")
        USER_ID = st.text_input("Clarifai user id")
        APP_ID = st.text_input("Clarifai app id")
        uploaded_files = st.file_uploader(
            "Upload your PDFs here", accept_multiple_files=True)

    if not (CLARIFAI_PAT and USER_ID and APP_ID and uploaded_files):
        st.info("Please add your Clarifai PAT, USER_ID, APP_ID and upload files to continue.")
    elif st.button("Get the response"):
        with st.spinner("Processing"):
            # 1. Chunk the uploaded PDFs.
            docs = load_chunk_pdf(uploaded_files)
            # 2. Index the chunks on Clarifai.
            clarifai_vector_db = vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT)
            # 3. Build the Q&A chain; retrieval and prompt assembly
            #    happen inside RetrievalQA when .run() is called.
            conversation = QandA(CLARIFAI_PAT, clarifai_vector_db)
            response = conversation.run(user_question)
            st.write(response)

if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment