# Q&A on all your academic papers…
import logging
import os
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma
logging.getLogger("langchain").setLevel(logging.DEBUG)
# Dropbox folder with academic papers
PAPER_FOLDER = "/Users/rjurney/Dropbox/Academic Papers/"
assert os.path.exists(PAPER_FOLDER)
# Set in my ~/.zshrc
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
# Load all PDFs from academic paper folder
loader = PyPDFDirectoryLoader(PAPER_FOLDER, silent_errors=True)
docs = loader.load()
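# PyPDFDirectoryLoader emits one Document per PDF page; silent_errors=True
# skips files pypdf cannot parse instead of raising. A quick sanity check:
print(f"Loaded {len(docs)} page-level segments")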
# How many papers on network motifs?
motif_docs = [(x.metadata["source"], x.page_content) for x in docs if "motif" in x.page_content]
motif_doc_count = len(motif_docs)
paper_count = len(set(x[0] for x in motif_docs))
print(
    f"You have {paper_count} papers on network motifs split across {motif_doc_count} document segments in `{PAPER_FOLDER}`."
)
# Embed the documents with OpenAI's ada embedding model and store them in ChromaDB
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(docs, embedding=embeddings, persist_directory="data")
vectordb.persist()
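# On a later run you can reopen the persisted store instead of re-embedding
# everything (a sketch; assumes the "data" directory written above is intact):
#   vectordb = Chroma(persist_directory="data", embedding_function=embeddings)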
# Set up a simple buffer memory to carry the chat history across API calls
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
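# (ConversationBufferMemory keeps the full turn history and replays it with
# every call; fine for short sessions, but the prompt grows with each exchange.)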
# Create a ConversationalRetrievalChain from the LLM, the vectorstore, and the memory system
qa = ConversationalRetrievalChain.from_llm(
OpenAI(temperature=0.8),
vectordb.as_retriever(),
memory=memory,
verbose=True,
)
result = qa({"question": "What are the different types of network motif?"})
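# The chain returns a dict whose "answer" key holds the model's response
print(result["answer"])

# With the buffer memory attached, a follow-up can refer back to the previous
# answer (an illustrative question, not from the original gist):
followup = qa({"question": "Which of those motif types appear most often in real-world networks?"})
print(followup["answer"])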

# pyproject.toml: Poetry project file for the course environment
[tool.poetry]
name = "chatbot-class"
version = "0.1.0"
description = "Course covering generative AI, large language models (LLMs), vector search, retrieval-augmented generation (RAG), and LLM fine-tuning."
authors = ["Russell Jurney <rjurney@graphlet.ai>"]
license = "MIT"
readme = "README.md"
packages = [{include = "chatbot_class"}]
[tool.poetry.dependencies]
python = "^3.10"
openai = "^0.28.0"
chromadb = "^0.4.13"
langchain = "^0.0.301"
pypdf = "^3.16.2"
aws-cdk-lib = "^2.97.0"
tiktoken = "^0.5.1"
wandb = "^0.15.11"
[tool.poetry.group.dev.dependencies]
black = "^23.9.1"
flake8 = "^6.1.0"
isort = "^5.12.0"
mypy = "^1.5.1"
pre-commit = "^3.4.0"
ipython = "^8.15.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 100
target-version = ["py310"]
include = '(chatbot_class|test)/.*\.pyi?$'
[tool.isort]
profile = "black"
src_paths = ["chatbot_class", "test"]
[tool.mypy]
python_version = "3.10"
mypy_path = ["chatbot_class", "test"]
warn_return_any = true
warn_unused_configs = true
warn_redundant_casts = true
warn_unused_ignores = true
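# Install the pinned environment with `poetry install`. Note these pins
# (langchain ^0.0.301, openai ^0.28) predate the openai 1.x client rewrite and
# langchain's 0.1 package split, so the import paths in the script match them.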