Skip to content

Instantly share code, notes, and snippets.

@davidmoore-io
Last active July 2, 2023 22:40
Show Gist options
  • Save davidmoore-io/1f9dd56bc06e2a1023f9cef3e6a5da8f to your computer and use it in GitHub Desktop.
chatdoc_v0.0.2.py
#python -m pip install openai chromadb langchain unstructured
import os
# Read the API key from the environment, 'cause we're grownups.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# We need things.
# NOTE: Unstructured loaders have many dependencies - https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html
from langchain import OpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate


def main():
    """Load a Word document, index it in Chroma, and answer one question via RetrievalQA."""
    # Fail fast with a clear message instead of an opaque auth error deep in the stack.
    if not OPENAI_API_KEY:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    # Load the word doc.
    # There's a decent blog post on building the unstructured integration - https://blog.langchain.dev/langchain-unstructured/
    # NOTE: The DirectoryLoader will come into play in the next iteration.
    print('Loading test doc...')
    doc_loader = UnstructuredFileLoader("example_data/gpt-4.docx")
    documents = doc_loader.load()

    # Split everything up.
    # The original chunk_size of 25 characters was far too small to hold a useful
    # passage (and makes the splitter warn about oversized chunks); 1000 is a sane default.
    text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
    texts = text_splitter.split_documents(documents)

    # Do the embedding magic: embed the chunks and index them in Chroma.
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    docsearch = Chroma.from_documents(texts, embeddings)

    # Now we specify the model, pass it the 'stuffed' chain and the Chroma vectorstore.
    # (VectorDBQA is deprecated; RetrievalQA is its replacement.)
    llm = OpenAI(model_name='text-davinci-003', temperature=0.8, openai_api_key=OPENAI_API_KEY)
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

    query = "Who are the authors of the paper?"
    print(qa.run(query))
    print("Done")

    # Notes for later:
    # We can ask it to cite its sources - this will be important in the BB use case.
    # from langchain.chains.qa_with_sources import load_qa_with_sources_chain
    # chain = load_qa_with_sources_chain(llm, chain_type="stuff")
    # chain({"input_documents": docs, "question": query}, return_only_outputs=True)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment