Last active
September 26, 2023 13:04
-
-
Save rjurney/70fbedd332c89410d3dca4f9aff78954 to your computer and use it in GitHub Desktop.
Q&A on all your academic papers…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Q&A over a folder of academic papers using LangChain, OpenAI and ChromaDB.

Loads every PDF under PAPER_FOLDER, embeds the page segments with OpenAI's
embedding model into a persistent Chroma store, then answers questions via a
ConversationalRetrievalChain backed by a conversation buffer memory.
"""
import logging
import os

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma

logging.getLogger("langchain").setLevel(logging.DEBUG)

# Dropbox folder with academic papers; override with the PAPER_FOLDER env var.
PAPER_FOLDER = os.environ.get("PAPER_FOLDER", "/Users/rjurney/Dropbox/Academic Papers/")
if not os.path.exists(PAPER_FOLDER):
    # Raise explicitly instead of `assert`: asserts are stripped under `python -O`.
    raise FileNotFoundError(f"Paper folder does not exist: {PAPER_FOLDER}")

# Set in my ~/.zshrc
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Load all PDFs from the academic paper folder, skipping unreadable files.
loader = PyPDFDirectoryLoader(PAPER_FOLDER, silent_errors=True)
docs = loader.load()

# How many papers mention network motifs? Each loaded doc is one page/segment,
# so count distinct source files separately from the segment count.
motif_docs = [(x.metadata["source"], x.page_content) for x in docs if "motif" in x.page_content]
motif_doc_count = len(motif_docs)
paper_count = len(set(source for source, _ in motif_docs))
print(
    f"You have {paper_count} papers on network motifs split across {motif_doc_count} document segments in `{PAPER_FOLDER}`."
)

# Embed the segments with the OpenAI ada model and persist them in ChromaDB on disk
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(docs, embedding=embeddings, persist_directory="data")
vectordb.persist()

# Simple buffer memory: the accumulated chat history is resubmitted with each
# API call to provide prompt context
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Conversational RAG chain: LLM + vector-store retriever + buffer memory
qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0.8),
    vectordb.as_retriever(),
    memory=memory,
    verbose=True,
)
result = qa({"question": "What are the different types of network motif?"})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[tool.poetry]
name = "chatbot-class"
version = "0.1.0"
description = "Course covering generative AI, large language models (LLMs), vector search, retrieval-augmented generation (RAG), LLM fine-tuning."
authors = ["Russell Jurney <rjurney@graphlet.ai>"]
license = "MIT"
readme = "README.md"
packages = [{include = "chatbot_class"}]

[tool.poetry.dependencies]
python = "^3.10"
openai = "^0.28.0"
chromadb = "^0.4.13"
langchain = "^0.0.301"
pypdf = "^3.16.2"
aws-cdk-lib = "^2.97.0"
tiktoken = "^0.5.1"
wandb = "^0.15.11"

[tool.poetry.group.dev.dependencies]
black = "^23.9.1"
flake8 = "^6.1.0"
isort = "^5.12.0"
mypy = "^1.5.1"
pre-commit = "^3.4.0"
ipython = "^8.15.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 100
target-version = ["py310"]
include = ["chatbot_class", "test"]

[tool.isort]
profile = "black"
src_paths = ["chatbot_class", "test"]

[tool.mypy]
python_version = "3.10"
mypy_path = ["chatbot_class", "test"]
warn_return_any = true
warn_unused_configs = true
warn_redundant_casts = true
warn_unused_ignores = true
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment