Skip to content

Instantly share code, notes, and snippets.

@Daethyra
Last active October 5, 2023 23:52
Show Gist options
  • Save Daethyra/3c2a1ab8bda6e326513d52a77d6b5ea7 to your computer and use it in GitHub Desktop.
Save Daethyra/3c2a1ab8bda6e326513d52a77d6b5ea7 to your computer and use it in GitHub Desktop.
ask-a-book-questions.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Daethyra/3c2a1ab8bda6e326513d52a77d6b5ea7/ask-a-book-questions.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d615a77",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9d615a77",
"outputId": "97147eb9-1846-4411-c649-293732203fba"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: langchain in /usr/local/lib/python3.10/dist-packages (0.0.309)\n",
"Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n",
"Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.21)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.8.5)\n",
"Requirement already satisfied: anyio<4.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.7.1)\n",
"Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n",
"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.6.1)\n",
"Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.33)\n",
"Requirement already satisfied: langsmith<0.1.0,>=0.0.40 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.0.42)\n",
"Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n",
"Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.10.13)\n",
"Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n",
"Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n",
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (3.3.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
"Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (3.4)\n",
"Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.3.0)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.1.3)\n",
"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.20.1)\n",
"Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
"Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n",
"Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (4.5.0)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.6)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2023.7.22)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.0)\n",
"Requirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.10/dist-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json<0.7,>=0.5.7->langchain) (23.2)\n",
"Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h/content\n",
"total 13136\n",
"drwxr-xr-x 4 root root 4096 Oct 4 13:22 .config\n",
"drwxr-xr-x 1 root root 4096 Oct 4 13:23 sample_data\n",
"drwxr-xr-x 1 root root 4096 Oct 5 23:14 ..\n",
"-rw-r--r-- 1 root root 13422975 Oct 5 23:16 field-guide-to-data-science.pdf\n",
"-rw-r--r-- 1 root root 125 Oct 5 23:18 .env\n",
"drwxr-xr-x 1 root root 4096 Oct 5 23:21 .\n",
"drwxr-xr-x 11 root root 4096 Oct 5 23:21 langchain-tutorials\n",
"fatal: destination path 'langchain-tutorials' already exists and is not an empty directory.\n"
]
}
],
"source": [
"!pip install langchain --upgrade\n",
"# Originally written against langchain 0.0.164; the output captured for this cell shows 0.0.309 installed.\n",
"\n",
"# Install necessary packages and upgrade outdated packages\n",
"!pip install -qU pinecone-client python-dotenv pypdf openai chromadb tiktoken\n",
"\n",
"# Clone Greg's langchain-tutorials repository (gkamradt on GitHub), which contains the data/ folder to work with. The files may need to be arranged within your Google Drive directory.\n",
"!pwd\n",
"!ls -ltra\n",
"!git clone https://github.com/gkamradt/langchain-tutorials.git\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d3e92ed",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2d3e92ed",
"outputId": "ee1273db-f3ca-4755-ec54-dad5463e35b8"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 19
}
],
"source": [
"# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader\n",
"from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader\n",
"\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "markdown",
"id": "5166d759",
"metadata": {
"id": "5166d759"
},
"source": [
"### Load your data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4a2d6bf",
"metadata": {
"id": "b4a2d6bf"
},
"outputs": [],
"source": [
"loader = PyPDFLoader(\"./field-guide-to-data-science.pdf\")\n",
"\n",
"## Other options for loaders\n",
"# loader = UnstructuredPDFLoader(\"../data/field-guide-to-data-science.pdf\")\n",
"# loader = OnlinePDFLoader(\"https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bcdac23c",
"metadata": {
"id": "bcdac23c"
},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4fd7c9e",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b4fd7c9e",
"outputId": "04512c6b-9503-4d14-e479-d68dccf51892"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"You have 126 document(s) in your data\n",
"There are 2812 characters in your document\n"
]
}
],
"source": [
"# Note: If you're using PyPDFLoader then it will split by page for you already\n",
"print (f'You have {len(data)} document(s) in your data')\n",
"print (f'There are {len(data[30].page_content)} characters in your document')"
]
},
{
"cell_type": "markdown",
"id": "8af9b604",
"metadata": {
"id": "8af9b604"
},
"source": [
"### Chunk your data up into smaller documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb3c6f02",
"metadata": {
"id": "fb3c6f02"
},
"outputs": [],
"source": [
"# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.\n",
"# This is optional, test out on your own data.\n",
"\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)\n",
"texts = text_splitter.split_documents(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "879873a4",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "879873a4",
"outputId": "59fd6073-d29a-47fb-c738-156226ef2f73"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Now you have 162 documents\n"
]
}
],
"source": [
"print (f'Now you have {len(texts)} documents')"
]
},
{
"cell_type": "markdown",
"id": "838b2843",
"metadata": {
"id": "838b2843"
},
"source": [
"### Create embeddings of your documents to get ready for semantic search"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "373e695a",
"metadata": {
"id": "373e695a"
},
"outputs": [],
"source": [
"from langchain.vectorstores import Chroma, Pinecone\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"import pinecone"
]
},
{
"cell_type": "markdown",
"id": "884e7857",
"metadata": {
"id": "884e7857"
},
"source": [
"Check to see if there is an environment variable with your API key; if not, use the value you put below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42a1d5c3",
"metadata": {
"hide_input": false,
"id": "42a1d5c3"
},
"outputs": [],
"source": [
"OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'sk-')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4619d3a",
"metadata": {
"id": "b4619d3a"
},
"outputs": [],
"source": [
"embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)"
]
},
{
"cell_type": "markdown",
"id": "b73d8504",
"metadata": {
"id": "b73d8504"
},
"source": [
"### Option #1: Pinecone\n",
"If you want to use Pinecone, run the code below; if not, skip over to the Chroma section below it. You must first go to [Pinecone.io](https://www.pinecone.io/) and set up an account."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e093ef3",
"metadata": {
"hide_input": false,
"id": "0e093ef3"
},
"outputs": [],
"source": [
"PINECONE_API_KEY = os.getenv('PINECONE_API_KEY', 'YourAPIKey')\n",
"PINECONE_API_ENV = os.getenv('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env\n",
"\n",
"# initialize pinecone\n",
"pinecone.init(\n",
" api_key=PINECONE_API_KEY, # find at app.pinecone.io\n",
" environment=PINECONE_API_ENV # next to api key in console\n",
")\n",
"index_name = \"langchaintest\" # put in the name of your pinecone index here\n",
"\n",
"docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)"
]
},
{
"cell_type": "markdown",
"id": "76d66c06",
"metadata": {
"id": "76d66c06"
},
"source": [
"### Option #2: Chroma\n",
"\n",
"I like Chroma because it's local and easy to set up without an account."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e0d1c6a",
"metadata": {
"id": "4e0d1c6a"
},
"outputs": [],
"source": [
"# load it into Chroma\n",
"docsearch = Chroma.from_documents(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34929595",
"metadata": {
"id": "34929595"
},
"outputs": [],
"source": [
"query = \"What is the top priority of a good data science team?\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e0f5b45",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4e0f5b45",
"outputId": "187e38bc-2a34-4096-f41d-90b5bb9dd2a9"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"imagination should be the \n",
"hallmarks of Data Science. They \n",
"are fundamental to the success \n",
"of every Data Science project.\n"
]
}
],
"source": [
"# Here's an example of the first document that was returned\n",
"print(docs[0].page_content[:450])"
]
},
{
"cell_type": "markdown",
"id": "3c35dcd9",
"metadata": {
"id": "3c35dcd9"
},
"source": [
"### Query those docs to get your answer back"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f051337b",
"metadata": {
"id": "f051337b"
},
"outputs": [],
"source": [
"from langchain.llms import OpenAI\n",
"from langchain.chains.question_answering import load_qa_chain"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b9b1c03",
"metadata": {
"id": "6b9b1c03"
},
"outputs": [],
"source": [
"llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n",
"chain = load_qa_chain(llm, chain_type=\"stuff\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f67ea7c2",
"metadata": {
"id": "f67ea7c2"
},
"outputs": [],
"source": [
"query = \"What is the collect stage of data maturity?\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3dfd2b7d",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "3dfd2b7d",
"outputId": "9e1c3694-18ca-4c2d-fb17-b6ae07645920"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"' The collect stage of data maturity focuses on collecting internal or external datasets. Gathering sales records and corresponding weather data is an example of the collect stage.'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 46
}
],
"source": [
"chain.run(input_documents=docs, question=query)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"colab": {
"provenance": [],
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment