RAG with Clarifai and LangChain
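A small Streamlit app that performs retrieval-augmented generation (RAG) over uploaded PDFs: it chunks the documents, indexes them in a Clarifai vector store, and answers questions with a Clarifai-hosted GPT-4 model through LangChain. The full script follows.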
import streamlit as st
import tempfile
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Clarifai
from langchain.chains import RetrievalQA
from clarifai.modules.css import ClarifaiStreamlitCSS
st.set_page_config(page_title="Chat with Documents", page_icon="🦜")
st.title("🦜 RAG with Clarifai and Langchain")
ClarifaiStreamlitCSS.insert_default_css(st)
# 1. Data Organization: chunk documents
@st.cache_resource(ttl="1h")
def load_chunk_pdf(uploaded_files):
    # Write each uploaded file to a temporary directory so PyPDFLoader can
    # read it from a filesystem path, then load its pages as documents
    documents = []
    temp_dir = tempfile.TemporaryDirectory()
    for file in uploaded_files:
        temp_filepath = os.path.join(temp_dir.name, file.name)
        with open(temp_filepath, "wb") as f:
            f.write(file.getvalue())
        loader = PyPDFLoader(temp_filepath)
        documents.extend(loader.load())

    # Split the loaded pages into chunks of up to 1000 characters
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunked_documents = text_splitter.split_documents(documents)
    return chunked_documents
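# By default, CharacterTextSplitter splits on "\n\n" and then packs the pieces
# into chunks of up to chunk_size characters; with chunk_overlap=0, adjacent
# chunks share no text. A minimal standalone sketch (the sample text is
# hypothetical, not from this app):
#
#   splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
#   chunks = splitter.split_text("First section...\n\nSecond section...")
#   # -> a list of strings, each at most ~1000 characters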
# Create vector store on Clarifai for use in step 2
def vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT):
    clarifai_vector_db = Clarifai.from_documents(
        user_id=USER_ID,
        app_id=APP_ID,
        documents=docs,
        pat=CLARIFAI_PAT,
        number_of_docs=3,  # number of matching chunks returned per query
    )
    return clarifai_vector_db
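# Side note: the vector store can also be queried directly, without the LLM.
# A minimal sketch (reuses the credentials collected in main() below):
#
#   db = vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT)
#   matches = db.similarity_search("your query")  # top matching chunks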
def QandA(CLARIFAI_PAT, clarifai_vector_db):
    from langchain.llms import Clarifai
    USER_ID = "openai"
    APP_ID = "chat-completion"
    MODEL_ID = "GPT-4"
    # LLM to use (set to GPT-4 above)
    clarifai_llm = Clarifai(
        pat=CLARIFAI_PAT, user_id=USER_ID, app_id=APP_ID, model_id=MODEL_ID)
    # Type of LangChain chain to use: the "stuff" chain, which combines the
    # retrieved chunks and prepends them all to the prompt
    qa = RetrievalQA.from_chain_type(
        llm=clarifai_llm,
        chain_type="stuff",
        retriever=clarifai_vector_db.as_retriever()
    )
    return qa
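# The returned chain maps a question string to an answer string, e.g.:
#
#   qa = QandA(CLARIFAI_PAT, clarifai_vector_db)
#   answer = qa.run("What are these documents about?")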
def main():
    user_question = st.text_input(
        "Ask the GPT-4 model a question about your documents, then click 'Get the response'")
    with st.sidebar:
        st.subheader("Add your Clarifai PAT, USER ID, APP ID along with the documents")
        # Get the Clarifai PAT, USER_ID, and APP_ID
        CLARIFAI_PAT = st.text_input("Clarifai PAT", type="password")
        USER_ID = st.text_input("Clarifai user id")
        APP_ID = st.text_input("Clarifai app id")
        uploaded_files = st.file_uploader(
            "Upload your PDFs here", accept_multiple_files=True)

    if not (CLARIFAI_PAT and USER_ID and APP_ID and uploaded_files):
        st.info("Please add your Clarifai PAT, USER_ID, APP_ID and upload files to continue.")
    elif st.button("Get the response"):
        with st.spinner("Processing"):
            # Process PDFs
            docs = load_chunk_pdf(uploaded_files)

            # Create a vector store
            clarifai_vector_db = vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT)

            # 2. Vector Creation: create Q&A chain
            conversation = QandA(CLARIFAI_PAT, clarifai_vector_db)

            # 3. Querying: ask the question to the GPT-4 model based on the documents.
            # This step also combines 4. retrieval and 5. prepending the context
            response = conversation.run(user_question)
            st.write(response)

if __name__ == '__main__':
    main()
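To try this locally (a sketch; the file name and exact package set are assumptions, not part of the gist):

    pip install streamlit langchain clarifai pypdf
    streamlit run app.py

Then paste your Clarifai PAT, user ID, and app ID into the sidebar and upload one or more PDFs.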