Last active
October 5, 2023 23:52
-
-
Save Daethyra/3c2a1ab8bda6e326513d52a77d6b5ea7 to your computer and use it in GitHub Desktop.
ask-a-book-questions.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Daethyra/3c2a1ab8bda6e326513d52a77d6b5ea7/ask-a-book-questions.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "9d615a77", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "9d615a77", | |
"outputId": "97147eb9-1846-4411-c649-293732203fba" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Requirement already satisfied: langchain in /usr/local/lib/python3.10/dist-packages (0.0.309)\n", | |
"Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n", | |
"Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.21)\n", | |
"Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.8.5)\n", | |
"Requirement already satisfied: anyio<4.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.7.1)\n", | |
"Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", | |
"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.6.1)\n", | |
"Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.33)\n", | |
"Requirement already satisfied: langsmith<0.1.0,>=0.0.40 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.0.42)\n", | |
"Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n", | |
"Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.10.13)\n", | |
"Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n", | |
"Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n", | |
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n", | |
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (3.3.0)\n", | |
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n", | |
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.2)\n", | |
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.0)\n", | |
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", | |
"Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (3.4)\n", | |
"Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.3.0)\n", | |
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.1.3)\n", | |
"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.20.1)\n", | |
"Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n", | |
"Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n", | |
"Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (4.5.0)\n", | |
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.6)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2023.7.22)\n", | |
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.0)\n", | |
"Requirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.10/dist-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json<0.7,>=0.5.7->langchain) (23.2)\n", | |
"Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25h/content\n", | |
"total 13136\n", | |
"drwxr-xr-x 4 root root 4096 Oct 4 13:22 .config\n", | |
"drwxr-xr-x 1 root root 4096 Oct 4 13:23 sample_data\n", | |
"drwxr-xr-x 1 root root 4096 Oct 5 23:14 ..\n", | |
"-rw-r--r-- 1 root root 13422975 Oct 5 23:16 field-guide-to-data-science.pdf\n", | |
"-rw-r--r-- 1 root root 125 Oct 5 23:18 .env\n", | |
"drwxr-xr-x 1 root root 4096 Oct 5 23:21 .\n", | |
"drwxr-xr-x 11 root root 4096 Oct 5 23:21 langchain-tutorials\n", | |
"fatal: destination path 'langchain-tutorials' already exists and is not an empty directory.\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install langchain --upgrade\n", | |
"# Originally written against LangChain version 0.0.164; the upgrade above may install a newer release.\n", | |
"\n", | |
"# Install necessary packages and upgrade outdated packages\n", | |
"!pip install -qU pinecone-client python-dotenv pypdf openai chromadb tiktoken\n", | |
"\n", | |
"# Clone Greg's LangChain tutorials repository, which contains the data/ folder to work with. Requires the files to be arranged in your Google Drive directory.\n", | |
"!pwd\n", | |
"!ls -ltra\n", | |
"!git clone https://github.com/gkamradt/langchain-tutorials.git\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "2d3e92ed", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "2d3e92ed", | |
"outputId": "ee1273db-f3ca-4755-ec54-dad5463e35b8" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 19 | |
} | |
], | |
"source": [ | |
"# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader\n", | |
"from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader\n", | |
"\n", | |
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n", | |
"import os\n", | |
"from dotenv import load_dotenv\n", | |
"\n", | |
"load_dotenv()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "5166d759", | |
"metadata": { | |
"id": "5166d759" | |
}, | |
"source": [ | |
"### Load your data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "b4a2d6bf", | |
"metadata": { | |
"id": "b4a2d6bf" | |
}, | |
"outputs": [], | |
"source": [ | |
"loader = PyPDFLoader(\"./field-guide-to-data-science.pdf\")\n", | |
"\n", | |
"## Other options for loaders\n", | |
"# loader = UnstructuredPDFLoader(\"../data/field-guide-to-data-science.pdf\")\n", | |
"# loader = OnlinePDFLoader(\"https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "bcdac23c", | |
"metadata": { | |
"id": "bcdac23c" | |
}, | |
"outputs": [], | |
"source": [ | |
"data = loader.load()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "b4fd7c9e", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "b4fd7c9e", | |
"outputId": "04512c6b-9503-4d14-e479-d68dccf51892" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"You have 126 document(s) in your data\n", | |
"There are 2812 characters in your document\n" | |
] | |
} | |
], | |
"source": [ | |
"# Note: If you're using PyPDFLoader then it will split by page for you already\n", | |
"print (f'You have {len(data)} document(s) in your data')\n", | |
"print (f'There are {len(data[30].page_content)} characters in your document')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "8af9b604", | |
"metadata": { | |
"id": "8af9b604" | |
}, | |
"source": [ | |
"### Chunk your data up into smaller documents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "fb3c6f02", | |
"metadata": { | |
"id": "fb3c6f02" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.\n", | |
"# This is optional, test out on your own data.\n", | |
"\n", | |
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)\n", | |
"texts = text_splitter.split_documents(data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "879873a4", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "879873a4", | |
"outputId": "59fd6073-d29a-47fb-c738-156226ef2f73" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Now you have 162 documents\n" | |
] | |
} | |
], | |
"source": [ | |
"print (f'Now you have {len(texts)} documents')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "838b2843", | |
"metadata": { | |
"id": "838b2843" | |
}, | |
"source": [ | |
"### Create embeddings of your documents to get ready for semantic search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "373e695a", | |
"metadata": { | |
"id": "373e695a" | |
}, | |
"outputs": [], | |
"source": [ | |
"from langchain.vectorstores import Chroma, Pinecone\n", | |
"from langchain.embeddings.openai import OpenAIEmbeddings\n", | |
"import pinecone" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "884e7857", | |
"metadata": { | |
"id": "884e7857" | |
}, | |
"source": [ | |
"Check to see if there is an environment variable with your API keys; if not, use what you put below" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "42a1d5c3", | |
"metadata": { | |
"hide_input": false, | |
"id": "42a1d5c3" | |
}, | |
"outputs": [], | |
"source": [ | |
"OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'sk-')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "b4619d3a", | |
"metadata": { | |
"id": "b4619d3a" | |
}, | |
"outputs": [], | |
"source": [ | |
"embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b73d8504", | |
"metadata": { | |
"id": "b73d8504" | |
}, | |
"source": [ | |
"### Option #1: Pinecone\n", | |
"If you want to use pinecone, run the code below, if not then skip over to Chroma below it. You must go to [Pinecone.io](https://www.pinecone.io/) and set up an account" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "0e093ef3", | |
"metadata": { | |
"hide_input": false, | |
"id": "0e093ef3" | |
}, | |
"outputs": [], | |
"source": [ | |
"PINECONE_API_KEY = os.getenv('PINECONE_API_KEY', 'YourAPIKey')\n", | |
"PINECONE_API_ENV = os.getenv('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env\n", | |
"\n", | |
"# initialize pinecone\n", | |
"pinecone.init(\n", | |
" api_key=PINECONE_API_KEY, # find at app.pinecone.io\n", | |
" environment=PINECONE_API_ENV # next to api key in console\n", | |
")\n", | |
"index_name = \"langchaintest\" # put in the name of your pinecone index here\n", | |
"\n", | |
"docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "76d66c06", | |
"metadata": { | |
"id": "76d66c06" | |
}, | |
"source": [ | |
"### Option #2: Chroma\n", | |
"\n", | |
"I like Chroma because it's local and easy to set up without an account" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "4e0d1c6a", | |
"metadata": { | |
"id": "4e0d1c6a" | |
}, | |
"outputs": [], | |
"source": [ | |
"# load it into Chroma\n", | |
"docsearch = Chroma.from_documents(texts, embeddings)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "34929595", | |
"metadata": { | |
"id": "34929595" | |
}, | |
"outputs": [], | |
"source": [ | |
"query = \"What is the top priority of a good data science team?\"\n", | |
"docs = docsearch.similarity_search(query)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "4e0f5b45", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "4e0f5b45", | |
"outputId": "187e38bc-2a34-4096-f41d-90b5bb9dd2a9" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"imagination should be the \n", | |
"hallmarks of Data Science. They \n", | |
"are fundamental to the success \n", | |
"of every Data Science project.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Here's an example of the first document that was returned\n", | |
"print(docs[0].page_content[:450])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3c35dcd9", | |
"metadata": { | |
"id": "3c35dcd9" | |
}, | |
"source": [ | |
"### Query those docs to get your answer back" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f051337b", | |
"metadata": { | |
"id": "f051337b" | |
}, | |
"outputs": [], | |
"source": [ | |
"from langchain.llms import OpenAI\n", | |
"from langchain.chains.question_answering import load_qa_chain" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "6b9b1c03", | |
"metadata": { | |
"id": "6b9b1c03" | |
}, | |
"outputs": [], | |
"source": [ | |
"llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", | |
"chain = load_qa_chain(llm, chain_type=\"stuff\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f67ea7c2", | |
"metadata": { | |
"id": "f67ea7c2" | |
}, | |
"outputs": [], | |
"source": [ | |
"query = \"What is the collect stage of data maturity?\"\n", | |
"docs = docsearch.similarity_search(query)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "3dfd2b7d", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"id": "3dfd2b7d", | |
"outputId": "9e1c3694-18ca-4c2d-fb17-b6ae07645920" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"' The collect stage of data maturity focuses on collecting internal or external datasets. Gathering sales records and corresponding weather data is an example of the collect stage.'" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 46 | |
} | |
], | |
"source": [ | |
"chain.run(input_documents=docs, question=query)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.13" | |
}, | |
"colab": { | |
"provenance": [], | |
"include_colab_link": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment