Chat with your uploaded file
# This code is a slight modification of the code at
# https://docs.chainlit.io/examples/qa
# that adds handling of PDF and Word .docx files in addition to plain-text files

# Load all necessary libraries and modules
import chainlit as cl
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain.docstore.document import Document
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chat_models import ChatOpenAI
from langchain.chains import (
    ConversationalRetrievalChain,
)
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
import docxpy
from langchain.embeddings.openai import OpenAIEmbeddings
from typing import List
import os
import shutil
import tempfile
# Get your OpenAI API key from the .env file (there are other ways to do this)
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
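# A minimal .env file for this script contains a single line, e.g.:
# OPENAI_API_KEY=<your-openai-api-key>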
# Create the system prompt that will be used for all user queries. Tweak as desired.
system_template = """Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know; don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
If the user opens with a greeting such as Hi, hello, or How are you, reply in kind.
An example of your response:
The answer is foo
SOURCES: xyz
Begin!
----------------
{context}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)
chain_type_kwargs = {"prompt": prompt}
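# These kwargs are handed to the "stuff" documents chain when the retrieval
# chain is built below (via combine_docs_chain_kwargs), so the custom system
# prompt above is actually applied to every query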
# This chainlit decorator tells the system what to do when a chat begins
@cl.on_chat_start
async def on_chat_start():
    files = None
    # Wait for the user to upload a file
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a text, .pdf, or .docx file to begin!",
            accept=["text/plain", "application/pdf",
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
            max_size_mb=20,
            timeout=180,
        ).send()
    file = files[0]
    msg = cl.Message(
        content=f"Processing `{file.name}`...", disable_human_feedback=True
    )
    await msg.send()
    # Decode the file directly if it's a .txt file
    if file.name.endswith('.txt'):
        text = file.content.decode("utf-8")

    # Save a temporary copy of non-.txt uploads so the file-based parsers
    # (PyPDF2, docxpy) can read them from disk
    def save_temp_copy(uploaded_file):
        # Create a temporary directory
        temp_dir = tempfile.mkdtemp()
        # Write the uploaded file's bytes into the temporary directory
        temp_file_path = os.path.join(temp_dir, uploaded_file.name)
        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.content)
        # Return the path of the temporary file
        return temp_file_path

    # Process if a PDF file
    if file.name.endswith('.pdf'):
        pdf_path = save_temp_copy(file)
        text = ""
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # extract_text() can return None on image-only pages
            text += (page.extract_text() or "") + "\n\n"

    # Process if a Word .docx file
    if file.name.endswith('.docx'):
        docx_path = save_temp_copy(file)
        text = docxpy.process(docx_path)
    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_text(text)

    # Create metadata for each text chunk so answers can cite their source
    metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]

    # Create a Chroma vector store; cl.make_async runs the blocking
    # embedding/indexing call in a worker thread so the UI stays responsive
    embeddings = OpenAIEmbeddings()
    docsearch = await cl.make_async(Chroma.from_texts)(
        texts, embeddings, metadatas=metadatas
    )
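    # Give the chain conversational memory so follow-up questions are
    # interpreted in the context of earlier chat turns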
    message_history = ChatMessageHistory()
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )
    # Create a chain that uses the Chroma vector store and the custom prompt
    chain = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
        combine_docs_chain_kwargs=chain_type_kwargs,
    )

    # Let the user know that the system is ready
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()

    # Store the chain in the user session so the message handler can reuse it
    cl.user_session.set("chain", chain)
# This chainlit decorator tells the system what to do each time the user sends a chat message
@cl.on_message
async def main(message):
    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
    cb = cl.AsyncLangchainCallbackHandler()
    res = await chain.acall(message, callbacks=[cb])
    answer = res["answer"]
    source_documents = res["source_documents"]  # type: List[Document]
    text_elements = []  # type: List[cl.Text]
    if source_documents:
        for source_idx, source_doc in enumerate(source_documents):
            source_name = f"source_{source_idx}"
            # Create the text element referenced in the message
            text_elements.append(
                cl.Text(content=source_doc.page_content, name=source_name)
            )
        source_names = [text_el.name for text_el in text_elements]
        if source_names:
            answer += f"\nSources: {', '.join(source_names)}"
        else:
            answer += "\nNo sources found"
    await cl.Message(content=answer, elements=text_elements).send()
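
# To launch the chat app locally, save this script and run it with Chainlit's
# CLI (the file name app.py below is just an example):
#   chainlit run app.py -w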