virattt/langchain-per-ticker-rag.ipynb

## langchain-per-ticker-rag.ipynb
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/virattt/985a352b945a0e1164e91415f1ab2eeb/langchain-per-ticker-rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Step 0. Install dependencies"
      ],
      "metadata": {
        "id": "S2mGQxA958dW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Install into the running kernel's environment (%pip) rather than a subshell (!pip).\n",
        "# Version bounds keep the APIs used below available:\n",
        "#   - `pinecone.init(...)` exists only in pinecone-client releases before 3.0\n",
        "#   - the `langchain.vectorstores` / `langchain.chat_models` import paths are legacy\n",
        "#     paths that later langchain releases deprecate and move out of the core package\n",
        "%pip install -q openai \"pinecone-client<3\" \"langchain<0.1\" tiktoken pypdf"
      ],
      "metadata": {
        "id": "2bY0NapN_z98"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Step 1. Load the SEC filings"
      ],
      "metadata": {
        "id": "XfKcntc4ZP7_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "from langchain.document_loaders import PyPDFLoader\n",
        "\n",
        "# One splitter shared by both filings: chunk_size=1000, no overlap between chunks\n",
        "text_splitter = RecursiveCharacterTextSplitter(\n",
        "    chunk_size=1000,\n",
        "    chunk_overlap=0,\n",
        ")"
      ],
      "metadata": {
        "id": "HpaLNifTakx0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# $AAPL financial report. Loading may take 1-2 minutes since the PDF is large\n",
        "aapl_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0000320193/f8aaeabb-7a2a-479d-bf72-9559ff51ea5d.pdf\"\n",
        "\n",
        "# Build a PDF loader for the report and load it\n",
        "loader = PyPDFLoader(aapl_10Q)\n",
        "aapl_documents = loader.load()\n",
        "\n",
        "# Chunk the report, then keep only each chunk's text\n",
        "docs = text_splitter.split_documents(aapl_documents)\n",
        "aapl_texts = [chunk.page_content for chunk in docs]"
      ],
      "metadata": {
        "id": "qd_hnXNoZS8t"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# $META financial report. Loading may take 1-2 minutes since the PDF is large\n",
        "meta_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/ba763267-0ccb-4870-a7c5-e1bfd92a9ca7.pdf\"\n",
        "\n",
        "# Build a PDF loader for the report and load it\n",
        "loader = PyPDFLoader(meta_10Q)\n",
        "meta_documents = loader.load()\n",
        "\n",
        "# Chunk the report, then keep only each chunk's text\n",
        "docs = text_splitter.split_documents(meta_documents)\n",
        "meta_texts = [chunk.page_content for chunk in docs]"
      ],
      "metadata": {
        "id": "_lhlnOiEZcPv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Step 2. Set up vector store"
      ],
      "metadata": {
        "id": "bR6Iagsz6EE8"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pinecone\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.vectorstores import Pinecone, Weaviate"
      ],
      "metadata": {
        "id": "zOpCZoiQWOPI"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "from getpass import getpass\n",
        "\n",
        "\n",
        "def _setting(name, secret=False):\n",
        "    \"\"\"Return the environment variable `name`, prompting for it when unset or empty.\n",
        "\n",
        "    With `secret=True` the prompt does not echo what is typed.\n",
        "    \"\"\"\n",
        "    ask = getpass if secret else input\n",
        "    return os.environ.get(name) or ask(f\"{name}: \")\n",
        "\n",
        "\n",
        "# Keys are read from the environment (or prompted for) instead of being pasted into\n",
        "# the notebook, so they are not saved or shared along with it.\n",
        "# The environment should be the one specified next to the API key\n",
        "# in your Pinecone console\n",
        "pinecone.init(\n",
        "    api_key=_setting(\"PINECONE_API_KEY\", secret=True),\n",
        "    environment=_setting(\"PINECONE_ENVIRONMENT\"),\n",
        ")\n",
        "index = pinecone.Index(_setting(\"PINECONE_INDEX\"))\n",
        "openai_api_key = _setting(\"OPENAI_API_KEY\", secret=True)\n",
        "embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n",
        "vectorstore = Pinecone(index, embeddings, \"text\")"
      ],
      "metadata": {
        "id": "osPpAe0nWOkP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Step 3. Add SEC filings to vector store"
      ],
      "metadata": {
        "id": "FQ5SRiyAXPTY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Store each filing in its own Pinecone namespace, named after the ticker.\n",
        "# Explicit, deterministic ids mean that re-running this cell writes to the same ids\n",
        "# again instead of adding a second copy of every chunk under freshly generated ids.\n",
        "for ticker, texts in ((\"AAPL\", aapl_texts), (\"META\", meta_texts)):\n",
        "    chunk_ids = [f\"{ticker}-{i}\" for i in range(len(texts))]\n",
        "    vectorstore.add_texts(texts, ids=chunk_ids, namespace=ticker)"
      ],
      "metadata": {
        "id": "7o-qnJ1wX06V"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Step 4. Create Q&A Chain"
      ],
      "metadata": {
        "id": "8QmztXJsX-AW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.chat_models import ChatOpenAI\n",
        "from langchain.embeddings import OpenAIEmbeddings\n",
        "from langchain.prompts import ChatPromptTemplate\n",
        "from langchain_core.output_parsers import StrOutputParser\n",
        "from langchain_core.runnables import (\n",
        "    ConfigurableField,\n",
        "    RunnableBinding,\n",
        "    RunnableLambda,\n",
        "    RunnablePassthrough,\n",
        ")"
      ],
      "metadata": {
        "id": "xUubjxAMX8kb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Basic question-answering setup: a retriever over the vector store, a chat model,\n",
        "# and a prompt that tells the model to answer only from the supplied context.\n",
        "retriever = vectorstore.as_retriever()\n",
        "\n",
        "model = ChatOpenAI(openai_api_key=openai_api_key)\n",
        "\n",
        "template = \"\"\"Answer the question based only on the following context:\n",
        "{context}\n",
        "Question: {question}\n",
        "\"\"\"\n",
        "prompt = ChatPromptTemplate.from_template(template)"
      ],
      "metadata": {
        "id": "nGD2TzP-W29h"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Mark the retriever's `search_kwargs` as a configurable field so it can be set per call.\n",
        "# All vectorstore retrievers have `search_kwargs`; it is just a dictionary with\n",
        "# vectorstore-specific fields.\n",
        "search_kwargs_field = ConfigurableField(\n",
        "    id=\"search_kwargs\",\n",
        "    name=\"Search Kwargs\",\n",
        "    description=\"The search kwargs to use\",\n",
        ")\n",
        "configurable_retriever = retriever.configurable_fields(search_kwargs=search_kwargs_field)"
      ],
      "metadata": {
        "id": "kFICvec0YEZK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Assemble the chain: the retriever supplies `context`, the question is passed through\n",
        "# unchanged, then prompt -> model -> string output.\n",
        "chain_inputs = {\"context\": configurable_retriever, \"question\": RunnablePassthrough()}\n",
        "chain = chain_inputs | prompt | model | StrOutputParser()"
      ],
      "metadata": {
        "id": "y2Pu9wKKYXcz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Step 5. Ask questions, by ticker"
      ],
      "metadata": {
        "id": "QeH1RS15cASK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Point the retriever at the AAPL namespace for this call only\n",
        "aapl_config = {\"configurable\": {\"search_kwargs\": {\"namespace\": \"AAPL\"}}}\n",
        "chain.invoke(\"What was revenue in July 2023?\", config=aapl_config)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "5euxR9BRYXuo",
        "outputId": "77ed3501-bb39-4293-e6e4-a2efd7f525bc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'The revenue in July 2023 was $81,797 million.'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 53
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Point the retriever at the META namespace for this call only\n",
        "meta_config = {\"configurable\": {\"search_kwargs\": {\"namespace\": \"META\"}}}\n",
        "chain.invoke(\"What was revenue in September 2023??\", config=meta_config)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "_2zNW58rYcij",
        "outputId": "4c093f96-d1b4-4674-fc7e-5c74ff1e2919"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'The revenue in September 2023 was $34.146 billion.'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 54
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "wl8nn8s7Yd-H"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "base",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.12"
    },
    "orig_nbformat": 4,
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/virattt/985a352b945a0e1164e91415f1ab2eeb/langchain-per-ticker-rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Step 0. Install dependencies"
	],
	"metadata": {
	"id": "S2mGQxA958dW"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Install into the running kernel's environment (%pip) rather than a subshell (!pip).\n",
	"# Version bounds keep the APIs used below available:\n",
	"#   - `pinecone.init(...)` exists only in pinecone-client releases before 3.0\n",
	"#   - the `langchain.vectorstores` / `langchain.chat_models` import paths are legacy\n",
	"#     paths that later langchain releases deprecate and move out of the core package\n",
	"%pip install -q openai \"pinecone-client<3\" \"langchain<0.1\" tiktoken pypdf"
	],
	"metadata": {
	"id": "2bY0NapN_z98"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Step 1. Load the SEC filings"
	],
	"metadata": {
	"id": "XfKcntc4ZP7_"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
	"from langchain.document_loaders import PyPDFLoader\n",
	"\n",
	"# One splitter shared by both filings: chunk_size=1000, no overlap between chunks\n",
	"text_splitter = RecursiveCharacterTextSplitter(\n",
	"    chunk_size=1000,\n",
	"    chunk_overlap=0,\n",
	")"
	],
	"metadata": {
	"id": "HpaLNifTakx0"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# $AAPL financial report. Loading may take 1-2 minutes since the PDF is large\n",
	"aapl_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0000320193/f8aaeabb-7a2a-479d-bf72-9559ff51ea5d.pdf\"\n",
	"\n",
	"# Build a PDF loader for the report and load it\n",
	"loader = PyPDFLoader(aapl_10Q)\n",
	"aapl_documents = loader.load()\n",
	"\n",
	"# Chunk the report, then keep only each chunk's text\n",
	"docs = text_splitter.split_documents(aapl_documents)\n",
	"aapl_texts = [chunk.page_content for chunk in docs]"
	],
	"metadata": {
	"id": "qd_hnXNoZS8t"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# $META financial report. Loading may take 1-2 minutes since the PDF is large\n",
	"meta_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/ba763267-0ccb-4870-a7c5-e1bfd92a9ca7.pdf\"\n",
	"\n",
	"# Build a PDF loader for the report and load it\n",
	"loader = PyPDFLoader(meta_10Q)\n",
	"meta_documents = loader.load()\n",
	"\n",
	"# Chunk the report, then keep only each chunk's text\n",
	"docs = text_splitter.split_documents(meta_documents)\n",
	"meta_texts = [chunk.page_content for chunk in docs]"
	],
	"metadata": {
	"id": "_lhlnOiEZcPv"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Step 2. Set up vector store"
	],
	"metadata": {
	"id": "bR6Iagsz6EE8"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"import pinecone\n",
	"from langchain.embeddings.openai import OpenAIEmbeddings\n",
	"from langchain.vectorstores import Pinecone, Weaviate"
	],
	"metadata": {
	"id": "zOpCZoiQWOPI"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import os\n",
	"from getpass import getpass\n",
	"\n",
	"\n",
	"def _setting(name, secret=False):\n",
	"    \"\"\"Return the environment variable `name`, prompting for it when unset or empty.\n",
	"\n",
	"    With `secret=True` the prompt does not echo what is typed.\n",
	"    \"\"\"\n",
	"    ask = getpass if secret else input\n",
	"    return os.environ.get(name) or ask(f\"{name}: \")\n",
	"\n",
	"\n",
	"# Keys are read from the environment (or prompted for) instead of being pasted into\n",
	"# the notebook, so they are not saved or shared along with it.\n",
	"# The environment should be the one specified next to the API key\n",
	"# in your Pinecone console\n",
	"pinecone.init(\n",
	"    api_key=_setting(\"PINECONE_API_KEY\", secret=True),\n",
	"    environment=_setting(\"PINECONE_ENVIRONMENT\"),\n",
	")\n",
	"index = pinecone.Index(_setting(\"PINECONE_INDEX\"))\n",
	"openai_api_key = _setting(\"OPENAI_API_KEY\", secret=True)\n",
	"embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n",
	"vectorstore = Pinecone(index, embeddings, \"text\")"
	],
	"metadata": {
	"id": "osPpAe0nWOkP"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Step 3. Add SEC filings to vector store"
	],
	"metadata": {
	"id": "FQ5SRiyAXPTY"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Store each filing in its own Pinecone namespace, named after the ticker.\n",
	"# Explicit, deterministic ids mean that re-running this cell writes to the same ids\n",
	"# again instead of adding a second copy of every chunk under freshly generated ids.\n",
	"for ticker, texts in ((\"AAPL\", aapl_texts), (\"META\", meta_texts)):\n",
	"    chunk_ids = [f\"{ticker}-{i}\" for i in range(len(texts))]\n",
	"    vectorstore.add_texts(texts, ids=chunk_ids, namespace=ticker)"
	],
	"metadata": {
	"id": "7o-qnJ1wX06V"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Step 4. Create Q&A Chain"
	],
	"metadata": {
	"id": "8QmztXJsX-AW"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"from langchain.chat_models import ChatOpenAI\n",
	"from langchain.embeddings import OpenAIEmbeddings\n",
	"from langchain.prompts import ChatPromptTemplate\n",
	"from langchain_core.output_parsers import StrOutputParser\n",
	"from langchain_core.runnables import (\n",
	" ConfigurableField,\n",
	" RunnableBinding,\n",
	" RunnableLambda,\n",
	" RunnablePassthrough,\n",
	")"
	],
	"metadata": {
	"id": "xUubjxAMX8kb"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Basic question-answering setup: a retriever over the vector store, a chat model,\n",
	"# and a prompt that tells the model to answer only from the supplied context.\n",
	"retriever = vectorstore.as_retriever()\n",
	"\n",
	"model = ChatOpenAI(openai_api_key=openai_api_key)\n",
	"\n",
	"template = \"\"\"Answer the question based only on the following context:\n",
	"{context}\n",
	"Question: {question}\n",
	"\"\"\"\n",
	"prompt = ChatPromptTemplate.from_template(template)"
	],
	"metadata": {
	"id": "nGD2TzP-W29h"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Mark the retriever's `search_kwargs` as a configurable field so it can be set per call.\n",
	"# All vectorstore retrievers have `search_kwargs`; it is just a dictionary with\n",
	"# vectorstore-specific fields.\n",
	"search_kwargs_field = ConfigurableField(\n",
	"    id=\"search_kwargs\",\n",
	"    name=\"Search Kwargs\",\n",
	"    description=\"The search kwargs to use\",\n",
	")\n",
	"configurable_retriever = retriever.configurable_fields(search_kwargs=search_kwargs_field)"
	],
	"metadata": {
	"id": "kFICvec0YEZK"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Assemble the chain: the retriever supplies `context`, the question is passed through\n",
	"# unchanged, then prompt -> model -> string output.\n",
	"chain_inputs = {\"context\": configurable_retriever, \"question\": RunnablePassthrough()}\n",
	"chain = chain_inputs | prompt | model | StrOutputParser()"
	],
	"metadata": {
	"id": "y2Pu9wKKYXcz"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Step 5. Ask questions, by ticker"
	],
	"metadata": {
	"id": "QeH1RS15cASK"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Point the retriever at the AAPL namespace for this call only\n",
	"aapl_config = {\"configurable\": {\"search_kwargs\": {\"namespace\": \"AAPL\"}}}\n",
	"chain.invoke(\"What was revenue in July 2023?\", config=aapl_config)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	},
	"id": "5euxR9BRYXuo",
	"outputId": "77ed3501-bb39-4293-e6e4-a2efd7f525bc"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"'The revenue in July 2023 was $81,797 million.'"
	],
	"application/vnd.google.colaboratory.intrinsic+json": {
	"type": "string"
	}
	},
	"metadata": {},
	"execution_count": 53
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Point the retriever at the META namespace for this call only\n",
	"meta_config = {\"configurable\": {\"search_kwargs\": {\"namespace\": \"META\"}}}\n",
	"chain.invoke(\"What was revenue in September 2023??\", config=meta_config)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	},
	"id": "_2zNW58rYcij",
	"outputId": "4c093f96-d1b4-4674-fc7e-5c74ff1e2919"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"'The revenue in September 2023 was $34.146 billion.'"
	],
	"application/vnd.google.colaboratory.intrinsic+json": {
	"type": "string"
	}
	},
	"metadata": {},
	"execution_count": 54
	}
	]
	},
	{
	"cell_type": "code",
	"source": [],
	"metadata": {
	"id": "wl8nn8s7Yd-H"
	},
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "base",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.12"
	},
	"orig_nbformat": 4,
	"colab": {
	"provenance": [],
	"include_colab_link": true
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}