Daethyra/ask-a-book-questions.ipynb

## ask-a-book-questions.ipynb
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/Daethyra/3c2a1ab8bda6e326513d52a77d6b5ea7/ask-a-book-questions.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9d615a77",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9d615a77",
        "outputId": "97147eb9-1846-4411-c649-293732203fba"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: langchain in /usr/local/lib/python3.10/dist-packages (0.0.309)\n",
            "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n",
            "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.21)\n",
            "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.8.5)\n",
            "Requirement already satisfied: anyio<4.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.7.1)\n",
            "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n",
            "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.6.1)\n",
            "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.33)\n",
            "Requirement already satisfied: langsmith<0.1.0,>=0.0.40 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.0.42)\n",
            "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n",
            "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.10.13)\n",
            "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n",
            "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n",
            "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (3.3.0)\n",
            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n",
            "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.2)\n",
            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.0)\n",
            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
            "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (3.4)\n",
            "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.3.0)\n",
            "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.1.3)\n",
            "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.20.1)\n",
            "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
            "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n",
            "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (4.5.0)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.6)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2023.7.22)\n",
            "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.0)\n",
            "Requirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.10/dist-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json<0.7,>=0.5.7->langchain) (23.2)\n",
            "Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h/content\n",
            "total 13136\n",
            "drwxr-xr-x  4 root root     4096 Oct  4 13:22 .config\n",
            "drwxr-xr-x  1 root root     4096 Oct  4 13:23 sample_data\n",
            "drwxr-xr-x  1 root root     4096 Oct  5 23:14 ..\n",
            "-rw-r--r--  1 root root 13422975 Oct  5 23:16 field-guide-to-data-science.pdf\n",
            "-rw-r--r--  1 root root      125 Oct  5 23:18 .env\n",
            "drwxr-xr-x  1 root root     4096 Oct  5 23:21 .\n",
            "drwxr-xr-x 11 root root     4096 Oct  5 23:21 langchain-tutorials\n",
            "fatal: destination path 'langchain-tutorials' already exists and is not an empty directory.\n"
          ]
        }
      ],
      "source": [
        "!pip install langchain --upgrade\n",
        "# Version: 0.0.164\n",
        "\n",
        "# Install necessary packages and upgrade outdated packages\n",
        "!pip install -qU pinecone-client python-dotenv pypdf openai chromadb tiktoken\n",
        "\n",
        "# Clone Greg's LangChain tutorials repository, which contains the data/ folder to work with. Requires arrangement in Google Drive's directory.\n",
        "!pwd\n",
        "!ls -ltra\n",
        "!git clone https://github.com/gkamradt/langchain-tutorials.git\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2d3e92ed",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2d3e92ed",
        "outputId": "ee1273db-f3ca-4755-ec54-dad5463e35b8"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {},
          "execution_count": 19
        }
      ],
      "source": [
        "# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader\n",
        "from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader\n",
        "\n",
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "import os\n",
        "from dotenv import load_dotenv\n",
        "\n",
        "load_dotenv()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "5166d759",
      "metadata": {
        "id": "5166d759"
      },
      "source": [
        "### Load your data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b4a2d6bf",
      "metadata": {
        "id": "b4a2d6bf"
      },
      "outputs": [],
      "source": [
        "loader = PyPDFLoader(\"./field-guide-to-data-science.pdf\")\n",
        "\n",
        "## Other options for loaders\n",
        "# loader = UnstructuredPDFLoader(\"../data/field-guide-to-data-science.pdf\")\n",
        "# loader = OnlinePDFLoader(\"https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "bcdac23c",
      "metadata": {
        "id": "bcdac23c"
      },
      "outputs": [],
      "source": [
        "data = loader.load()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b4fd7c9e",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b4fd7c9e",
        "outputId": "04512c6b-9503-4d14-e479-d68dccf51892"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "You have 126 document(s) in your data\n",
            "There are 2812 characters in your document\n"
          ]
        }
      ],
      "source": [
        "# Note: If you're using PyPDFLoader then it will split by page for you already\n",
        "print (f'You have {len(data)} document(s) in your data')\n",
        "print (f'There are {len(data[30].page_content)} characters in your document')"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8af9b604",
      "metadata": {
        "id": "8af9b604"
      },
      "source": [
        "### Chunk your data up into smaller documents"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "fb3c6f02",
      "metadata": {
        "id": "fb3c6f02"
      },
      "outputs": [],
      "source": [
        "# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.\n",
        "# This is optional, test out on your own data.\n",
        "\n",
        "text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)\n",
        "texts = text_splitter.split_documents(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "879873a4",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "879873a4",
        "outputId": "59fd6073-d29a-47fb-c738-156226ef2f73"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Now you have 162 documents\n"
          ]
        }
      ],
      "source": [
        "print (f'Now you have {len(texts)} documents')"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "838b2843",
      "metadata": {
        "id": "838b2843"
      },
      "source": [
        "### Create embeddings of your documents to get ready for semantic search"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "373e695a",
      "metadata": {
        "id": "373e695a"
      },
      "outputs": [],
      "source": [
        "from langchain.vectorstores import Chroma, Pinecone\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "import pinecone"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "884e7857",
      "metadata": {
        "id": "884e7857"
      },
      "source": [
        "Check to see if there is an environment variable with your API keys. If not, use the value you put below."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "42a1d5c3",
      "metadata": {
        "hide_input": false,
        "id": "42a1d5c3"
      },
      "outputs": [],
      "source": [
        "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'sk-')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b4619d3a",
      "metadata": {
        "id": "b4619d3a"
      },
      "outputs": [],
      "source": [
        "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "b73d8504",
      "metadata": {
        "id": "b73d8504"
      },
      "source": [
        "### Option #1: Pinecone\n",
        "If you want to use Pinecone, run the code below; if not, skip over to the Chroma section below it. You must go to [Pinecone.io](https://www.pinecone.io/) and set up an account."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0e093ef3",
      "metadata": {
        "hide_input": false,
        "id": "0e093ef3"
      },
      "outputs": [],
      "source": [
        "PINECONE_API_KEY = os.getenv('PINECONE_API_KEY', 'YourAPIKey')\n",
        "PINECONE_API_ENV = os.getenv('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env\n",
        "\n",
        "# initialize pinecone\n",
        "pinecone.init(\n",
        "    api_key=PINECONE_API_KEY,  # find at app.pinecone.io\n",
        "    environment=PINECONE_API_ENV  # next to api key in console\n",
        ")\n",
        "index_name = \"langchaintest\" # put in the name of your pinecone index here\n",
        "\n",
        "docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "76d66c06",
      "metadata": {
        "id": "76d66c06"
      },
      "source": [
        "### Option #2: Chroma\n",
        "\n",
        "I like Chroma because it's local and easy to set up without an account"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4e0d1c6a",
      "metadata": {
        "id": "4e0d1c6a"
      },
      "outputs": [],
      "source": [
        "# load it into Chroma\n",
        "docsearch = Chroma.from_documents(texts, embeddings)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "34929595",
      "metadata": {
        "id": "34929595"
      },
      "outputs": [],
      "source": [
        "query = \"What is the top priority of a good data science team?\"\n",
        "docs = docsearch.similarity_search(query)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4e0f5b45",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4e0f5b45",
        "outputId": "187e38bc-2a34-4096-f41d-90b5bb9dd2a9"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "imagination should be the \n",
            "hallmarks of Data Science. They \n",
            "are fundamental to the success \n",
            "of every Data Science project.\n"
          ]
        }
      ],
      "source": [
        "# Here's an example of the first document that was returned\n",
        "print(docs[0].page_content[:450])"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3c35dcd9",
      "metadata": {
        "id": "3c35dcd9"
      },
      "source": [
        "### Query those docs to get your answer back"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f051337b",
      "metadata": {
        "id": "f051337b"
      },
      "outputs": [],
      "source": [
        "from langchain.llms import OpenAI\n",
        "from langchain.chains.question_answering import load_qa_chain"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6b9b1c03",
      "metadata": {
        "id": "6b9b1c03"
      },
      "outputs": [],
      "source": [
        "llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n",
        "chain = load_qa_chain(llm, chain_type=\"stuff\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f67ea7c2",
      "metadata": {
        "id": "f67ea7c2"
      },
      "outputs": [],
      "source": [
        "query = \"What is the collect stage of data maturity?\"\n",
        "docs = docsearch.similarity_search(query)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3dfd2b7d",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "3dfd2b7d",
        "outputId": "9e1c3694-18ca-4c2d-fb17-b6ae07645920"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "' The collect stage of data maturity focuses on collecting internal or external datasets. Gathering sales records and corresponding weather data is an example of the collect stage.'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 46
        }
      ],
      "source": [
        "chain.run(input_documents=docs, question=query)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.13"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/Daethyra/3c2a1ab8bda6e326513d52a77d6b5ea7/ask-a-book-questions.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "9d615a77",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "9d615a77",
	"outputId": "97147eb9-1846-4411-c649-293732203fba"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Requirement already satisfied: langchain in /usr/local/lib/python3.10/dist-packages (0.0.309)\n",
	"Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n",
	"Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.21)\n",
	"Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.8.5)\n",
	"Requirement already satisfied: anyio<4.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.7.1)\n",
	"Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n",
	"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.6.1)\n",
	"Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.33)\n",
	"Requirement already satisfied: langsmith<0.1.0,>=0.0.40 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.0.42)\n",
	"Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n",
	"Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.10.13)\n",
	"Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n",
	"Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n",
	"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n",
	"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (3.3.0)\n",
	"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n",
	"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.2)\n",
	"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.0)\n",
	"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
	"Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (3.4)\n",
	"Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.3.0)\n",
	"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->langchain) (1.1.3)\n",
	"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.20.1)\n",
	"Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
	"Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n",
	"Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (4.5.0)\n",
	"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.6)\n",
	"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2023.7.22)\n",
	"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.0)\n",
	"Requirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.10/dist-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json<0.7,>=0.5.7->langchain) (23.2)\n",
	"Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h/content\n",
	"total 13136\n",
	"drwxr-xr-x 4 root root 4096 Oct 4 13:22 .config\n",
	"drwxr-xr-x 1 root root 4096 Oct 4 13:23 sample_data\n",
	"drwxr-xr-x 1 root root 4096 Oct 5 23:14 ..\n",
	"-rw-r--r-- 1 root root 13422975 Oct 5 23:16 field-guide-to-data-science.pdf\n",
	"-rw-r--r-- 1 root root 125 Oct 5 23:18 .env\n",
	"drwxr-xr-x 1 root root 4096 Oct 5 23:21 .\n",
	"drwxr-xr-x 11 root root 4096 Oct 5 23:21 langchain-tutorials\n",
	"fatal: destination path 'langchain-tutorials' already exists and is not an empty directory.\n"
	]
	}
	],
	"source": [
	"!pip install langchain --upgrade\n",
	"# Version: 0.0.164\n",
	"\n",
	"# Install necessary packages and upgrade outdated packages\n",
	"!pip install -qU pinecone-client python-dotenv pypdf openai chromadb tiktoken\n",
	"\n",
	"# Clone Greg's LangChain tutorials repository, which contains the data/ folder to work with. Requires arrangement in Google Drive's directory.\n",
	"!pwd\n",
	"!ls -ltra\n",
	"!git clone https://github.com/gkamradt/langchain-tutorials.git\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "2d3e92ed",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "2d3e92ed",
	"outputId": "ee1273db-f3ca-4755-ec54-dad5463e35b8"
	},
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"True"
	]
	},
	"metadata": {},
	"execution_count": 19
	}
	],
	"source": [
	"# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader\n",
	"from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader\n",
	"\n",
	"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
	"import os\n",
	"from dotenv import load_dotenv\n",
	"\n",
	"load_dotenv()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "5166d759",
	"metadata": {
	"id": "5166d759"
	},
	"source": [
	"### Load your data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b4a2d6bf",
	"metadata": {
	"id": "b4a2d6bf"
	},
	"outputs": [],
	"source": [
	"loader = PyPDFLoader(\"./field-guide-to-data-science.pdf\")\n",
	"\n",
	"## Other options for loaders\n",
	"# loader = UnstructuredPDFLoader(\"../data/field-guide-to-data-science.pdf\")\n",
	"# loader = OnlinePDFLoader(\"https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "bcdac23c",
	"metadata": {
	"id": "bcdac23c"
	},
	"outputs": [],
	"source": [
	"data = loader.load()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b4fd7c9e",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "b4fd7c9e",
	"outputId": "04512c6b-9503-4d14-e479-d68dccf51892"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"You have 126 document(s) in your data\n",
	"There are 2812 characters in your document\n"
	]
	}
	],
	"source": [
	"# Note: If you're using PyPDFLoader then it will split by page for you already\n",
	"print (f'You have {len(data)} document(s) in your data')\n",
	"print (f'There are {len(data[30].page_content)} characters in your document')"
	]
	},
	{
	"cell_type": "markdown",
	"id": "8af9b604",
	"metadata": {
	"id": "8af9b604"
	},
	"source": [
	"### Chunk your data up into smaller documents"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "fb3c6f02",
	"metadata": {
	"id": "fb3c6f02"
	},
	"outputs": [],
	"source": [
	"# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.\n",
	"# This is optional, test out on your own data.\n",
	"\n",
	"text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)\n",
	"texts = text_splitter.split_documents(data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "879873a4",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "879873a4",
	"outputId": "59fd6073-d29a-47fb-c738-156226ef2f73"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Now you have 162 documents\n"
	]
	}
	],
	"source": [
	"print (f'Now you have {len(texts)} documents')"
	]
	},
	{
	"cell_type": "markdown",
	"id": "838b2843",
	"metadata": {
	"id": "838b2843"
	},
	"source": [
	"### Create embeddings of your documents to get ready for semantic search"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "373e695a",
	"metadata": {
	"id": "373e695a"
	},
	"outputs": [],
	"source": [
	"from langchain.vectorstores import Chroma, Pinecone\n",
	"from langchain.embeddings.openai import OpenAIEmbeddings\n",
	"import pinecone"
	]
	},
	{
	"cell_type": "markdown",
	"id": "884e7857",
	"metadata": {
	"id": "884e7857"
	},
	"source": [
	"Check to see if there is an environment variable with your API keys. If not, use the value you put below."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "42a1d5c3",
	"metadata": {
	"hide_input": false,
	"id": "42a1d5c3"
	},
	"outputs": [],
	"source": [
	"OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'sk-')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b4619d3a",
	"metadata": {
	"id": "b4619d3a"
	},
	"outputs": [],
	"source": [
	"embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b73d8504",
	"metadata": {
	"id": "b73d8504"
	},
	"source": [
	"### Option #1: Pinecone\n",
	"If you want to use Pinecone, run the code below; if not, skip over to the Chroma section below it. You must go to [Pinecone.io](https://www.pinecone.io/) and set up an account."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "0e093ef3",
	"metadata": {
	"hide_input": false,
	"id": "0e093ef3"
	},
	"outputs": [],
	"source": [
	"PINECONE_API_KEY = os.getenv('PINECONE_API_KEY', 'YourAPIKey')\n",
	"PINECONE_API_ENV = os.getenv('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env\n",
	"\n",
	"# initialize pinecone\n",
	"pinecone.init(\n",
	" api_key=PINECONE_API_KEY, # find at app.pinecone.io\n",
	" environment=PINECONE_API_ENV # next to api key in console\n",
	")\n",
	"index_name = \"langchaintest\" # put in the name of your pinecone index here\n",
	"\n",
	"docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "76d66c06",
	"metadata": {
	"id": "76d66c06"
	},
	"source": [
	"### Option #2: Chroma\n",
	"\n",
	"I like Chroma because it's local and easy to set up without an account"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4e0d1c6a",
	"metadata": {
	"id": "4e0d1c6a"
	},
	"outputs": [],
	"source": [
	"# load it into Chroma\n",
	"docsearch = Chroma.from_documents(texts, embeddings)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "34929595",
	"metadata": {
	"id": "34929595"
	},
	"outputs": [],
	"source": [
	"query = \"What is the top priority of a good data science team?\"\n",
	"docs = docsearch.similarity_search(query)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4e0f5b45",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "4e0f5b45",
	"outputId": "187e38bc-2a34-4096-f41d-90b5bb9dd2a9"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"imagination should be the \n",
	"hallmarks of Data Science. They \n",
	"are fundamental to the success \n",
	"of every Data Science project.\n"
	]
	}
	],
	"source": [
	"# Here's an example of the first document that was returned\n",
	"print(docs[0].page_content[:450])"
	]
	},
	{
	"cell_type": "markdown",
	"id": "3c35dcd9",
	"metadata": {
	"id": "3c35dcd9"
	},
	"source": [
	"### Query those docs to get your answer back"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "f051337b",
	"metadata": {
	"id": "f051337b"
	},
	"outputs": [],
	"source": [
	"from langchain.llms import OpenAI\n",
	"from langchain.chains.question_answering import load_qa_chain"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "6b9b1c03",
	"metadata": {
	"id": "6b9b1c03"
	},
	"outputs": [],
	"source": [
	"llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n",
	"chain = load_qa_chain(llm, chain_type=\"stuff\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "f67ea7c2",
	"metadata": {
	"id": "f67ea7c2"
	},
	"outputs": [],
	"source": [
	"query = \"What is the collect stage of data maturity?\"\n",
	"docs = docsearch.similarity_search(query)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3dfd2b7d",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	},
	"id": "3dfd2b7d",
	"outputId": "9e1c3694-18ca-4c2d-fb17-b6ae07645920"
	},
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"' The collect stage of data maturity focuses on collecting internal or external datasets. Gathering sales records and corresponding weather data is an example of the collect stage.'"
	],
	"application/vnd.google.colaboratory.intrinsic+json": {
	"type": "string"
	}
	},
	"metadata": {},
	"execution_count": 46
	}
	],
	"source": [
	"chain.run(input_documents=docs, question=query)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.13"
	},
	"colab": {
	"provenance": [],
	"include_colab_link": true
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}