Last active
July 2, 2023 22:40
-
-
Save davidmoore-io/1f9dd56bc06e2a1023f9cef3e6a5da8f to your computer and use it in GitHub Desktop.
chatdoc_v0.0.2.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#python -m pip install openai chromadb langchain unstructured
"""Minimal "chat with a document" demo.

Loads a Word document, splits it into character chunks, embeds the chunks
into an in-memory Chroma vector store, then answers a question about the
document with a retrieval-augmented ("stuff" chain) LLM call.
"""
import os

# get API key from env, 'cause we're grownups
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail fast with a clear message instead of an opaque auth error
    # deep inside the OpenAI client later on.
    raise SystemExit("OPENAI_API_KEY environment variable is not set")

# We need things.
# NOTE: Unstructured loaders have many dependencies -
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html
from langchain import OpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate

# Load the word doc
# There's a decent blog post on them building the unstructured integration -
# https://blog.langchain.dev/langchain-unstructured/
# NOTE: The DirectoryLoader will come into play in the next iteration
print('Loading test doc...')
doc_loader = UnstructuredFileLoader("example_data/gpt-4.docx")
documents = doc_loader.load()

# Split everything up
# NOTE(review): chunk_size=25 characters is unusually small (500-1000 is
# typical); tiny chunks give the retriever very little context per hit.
# Kept as-is to preserve behavior — confirm before tuning.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=25)
texts = text_splitter.split_documents(documents)

# Do the embedding magic
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
docsearch = Chroma.from_documents(texts, embeddings)

# Now we specify the model, pass it the 'stuffed' chain and the Chroma vectorstore
llm = OpenAI(model_name='text-davinci-003', temperature=0.8, openai_api_key=OPENAI_API_KEY)
# qa_chain = VectorDBQA.from_chain_type(llm=llm, chain_type='stuff', vectorstore=docsearch) - FAILS - VectorDBQA is deprecated
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

query = "Who are the authors of the paper?"
print(qa.run(query))
print("Done")

# qa_chain({'query': 'Whats the paper called and who wrote it?'}, return_only_outputs=True)
# qa_chain.run('Whats the paper called and who wrote it?')

# Notes for later
# We can get it to cite its sources - this will be important in the BB use case
# from langchain.chains.qa_with_sources import load_qa_with_sources_chain
# chain = load_qa_with_sources_chain(llm, chain_type="stuff")
# chain({"input_documents": docs, "question": query}, return_only_outputs=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.