# Q&A on all your academic papers…
import logging
import os
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma
logging.getLogger("langchain").setLevel(logging.DEBUG)
# Dropbox folder with academic papers
PAPER_FOLDER = "/Users/rjurney/Dropbox/Academic Papers/"
assert os.path.exists(PAPER_FOLDER)
# Set in my ~/.zshrc
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
# Load all PDFs from academic paper folder
loader = PyPDFDirectoryLoader(PAPER_FOLDER, silent_errors=True)
docs = loader.load()
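# PyPDFDirectoryLoader emits one Document per PDF page; silent_errors=True
# skips files pypdf cannot parse instead of raising. A quick sanity check:
print(f"Loaded {len(docs)} page-level segments")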
# How many papers on network motifs?
motif_docs = [(x.metadata["source"], x.page_content) for x in docs if "motif" in x.page_content]
motif_doc_count = len(motif_docs)
paper_count = len(set(x[0] for x in motif_docs))
print(
    f"You have {paper_count} papers on network motifs split across {motif_doc_count} document segments in `{PAPER_FOLDER}`."
)
# Embed the documents with OpenAI's ada embedding model and store them in ChromaDB
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(docs, embedding=embeddings, persist_directory="data")
vectordb.persist()
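# On a later run you can reopen the persisted store instead of re-embedding
# everything (a sketch; assumes the "data" directory written above is intact):
#   vectordb = Chroma(persist_directory="data", embedding_function=embeddings)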
# Set up a simple buffer memory to carry the chat history across API calls
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
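# (ConversationBufferMemory keeps the full turn history and replays it with
# every call; fine for short sessions, but the prompt grows with each exchange.)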
# Create a ConversationalRetrievalChain from the LLM, the vectorstore, and the memory system
qa = ConversationalRetrievalChain.from_llm(
OpenAI(temperature=0.8),
vectordb.as_retriever(),
memory=memory,
verbose=True,
)
result = qa({"question": "What are the different types of network motif?"})
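# The chain returns a dict whose "answer" key holds the model's response
print(result["answer"])

# With the buffer memory attached, a follow-up can refer back to the previous
# answer (an illustrative question, not from the original gist):
followup = qa({"question": "Which of those motif types appear most often in real-world networks?"})
print(followup["answer"])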

# pyproject.toml: Poetry project file for the course environment
[tool.poetry]
name = "chatbot-class"
version = "0.1.0"
description = "Course covering generative AI, large language models (LLMs), vector search, retrieval-augmented generation (RAG), and LLM fine-tuning."
authors = ["Russell Jurney <rjurney@graphlet.ai>"]
license = "MIT"
readme = "README.md"
packages = [{include = "chatbot_class"}]
[tool.poetry.dependencies]
python = "^3.10"
openai = "^0.28.0"
chromadb = "^0.4.13"
langchain = "^0.0.301"
pypdf = "^3.16.2"
aws-cdk-lib = "^2.97.0"
tiktoken = "^0.5.1"
wandb = "^0.15.11"
[tool.poetry.group.dev.dependencies]
black = "^23.9.1"
flake8 = "^6.1.0"
isort = "^5.12.0"
mypy = "^1.5.1"
pre-commit = "^3.4.0"
ipython = "^8.15.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 100
target-version = ["py310"]
include = '(chatbot_class|test)/.*\.pyi?$'
[tool.isort]
profile = "black"
src_paths = ["chatbot_class", "test"]
[tool.mypy]
python_version = "3.10"
mypy_path = ["chatbot_class", "test"]
warn_return_any = true
warn_unused_configs = true
warn_redundant_casts = true
warn_unused_ignores = true
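# Install the pinned environment with `poetry install`. Note these pins
# (langchain ^0.0.301, openai ^0.28) predate the openai 1.x client rewrite and
# langchain's 0.1 package split, so the import paths in the script match them.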