Last active
July 2, 2023 22:40
-
-
Save davidmoore-io/1f9dd56bc06e2a1023f9cef3e6a5da8f to your computer and use it in GitHub Desktop.
chatdoc_v0.0.2.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#python -m pip install openai chromadb langchain unstructured
"""Minimal "chat with a document" demo.

Loads a Word document, splits it into character chunks, embeds the chunks
into an in-memory Chroma vector store, then answers a question about the
document with a retrieval-augmented ("stuff" chain) LLM call.
"""
import os

# get API key from env, 'cause we're grownups
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail fast with a clear message instead of an opaque auth error
    # deep inside the OpenAI client later on.
    raise SystemExit("OPENAI_API_KEY environment variable is not set")

# We need things.
# NOTE: Unstructured loaders have many dependencies -
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html
from langchain import OpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate

# Load the word doc
# There's a decent blog post on them building the unstructured integration -
# https://blog.langchain.dev/langchain-unstructured/
# NOTE: The DirectoryLoader will come into play in the next iteration
print('Loading test doc...')
doc_loader = UnstructuredFileLoader("example_data/gpt-4.docx")
documents = doc_loader.load()

# Split everything up
# NOTE(review): chunk_size=25 characters is unusually small (500-1000 is
# typical); tiny chunks give the retriever very little context per hit.
# Kept as-is to preserve behavior — confirm before tuning.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=25)
texts = text_splitter.split_documents(documents)

# Do the embedding magic
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
docsearch = Chroma.from_documents(texts, embeddings)

# Now we specify the model, pass it the 'stuffed' chain and the Chroma vectorstore
llm = OpenAI(model_name='text-davinci-003', temperature=0.8, openai_api_key=OPENAI_API_KEY)
# qa_chain = VectorDBQA.from_chain_type(llm=llm, chain_type='stuff', vectorstore=docsearch) - FAILS - VectorDBQA is deprecated
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

query = "Who are the authors of the paper?"
print(qa.run(query))
print("Done")

# qa_chain({'query': 'Whats the paper called and who wrote it?'}, return_only_outputs=True)
# qa_chain.run('Whats the paper called and who wrote it?')

# Notes for later
# We can get it to cite its sources - this will be important in the BB use case
# from langchain.chains.qa_with_sources import load_qa_with_sources_chain
# chain = load_qa_with_sources_chain(llm, chain_type="stuff")
# chain({"input_documents": docs, "question": query}, return_only_outputs=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.