Skip to content

Instantly share code, notes, and snippets.

@virattt
Created December 27, 2023 18:01
Show Gist options
  • Save virattt/985a352b945a0e1164e91415f1ab2eeb to your computer and use it in GitHub Desktop.
Save virattt/985a352b945a0e1164e91415f1ab2eeb to your computer and use it in GitHub Desktop.
langchain-per-ticker-rag
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/virattt/985a352b945a0e1164e91415f1ab2eeb/langchain-per-ticker-rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Step 0. Install dependencies"
],
"metadata": {
"id": "S2mGQxA958dW"
}
},
{
"cell_type": "code",
"source": [
"# Pin the libraries whose APIs this notebook depends on, and use %pip so the\n",
"# packages are installed into the environment of the running kernel.\n",
"# - pinecone-client 3.x removed `pinecone.init`, which Step 2 calls.\n",
"# - newer langchain releases deprecate and relocate the `langchain.*` import\n",
"#   paths (text_splitter, document_loaders, vectorstores, ...) used below.\n",
"%pip install -q openai\n",
"%pip install -q \"pinecone-client<3\"\n",
"%pip install -q \"langchain<0.1\"\n",
"%pip install -q tiktoken\n",
"%pip install -q pypdf"
],
"metadata": {
"id": "2bY0NapN_z98"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Step 1. Load the SEC filings"
],
"metadata": {
"id": "XfKcntc4ZP7_"
}
},
{
"cell_type": "code",
"source": [
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.document_loaders import PyPDFLoader\n",
"\n",
"# One splitter shared by every filing below: chunk_size of 1000 and no overlap between chunks\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
"    chunk_size=1000,\n",
"    chunk_overlap=0,\n",
")"
],
"metadata": {
"id": "HpaLNifTakx0"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Location of $AAPL's financial report. Loading may take 1-2 minutes since the PDF is large\n",
"aapl_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0000320193/f8aaeabb-7a2a-479d-bf72-9559ff51ea5d.pdf\"\n",
"\n",
"# Build the PDF loader and read the report into documents\n",
"loader = PyPDFLoader(aapl_10Q)\n",
"aapl_documents = loader.load()\n",
"\n",
"# Split the report into chunks, then keep only the text of each chunk\n",
"docs = text_splitter.split_documents(aapl_documents)\n",
"aapl_texts = [chunk.page_content for chunk in docs]"
],
"metadata": {
"id": "qd_hnXNoZS8t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Location of $META's financial report. Loading may take 1-2 minutes since the PDF is large\n",
"meta_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/ba763267-0ccb-4870-a7c5-e1bfd92a9ca7.pdf\"\n",
"\n",
"# Build the PDF loader and read the report into documents\n",
"loader = PyPDFLoader(meta_10Q)\n",
"meta_documents = loader.load()\n",
"\n",
"# Split the report into chunks, then keep only the text of each chunk\n",
"docs = text_splitter.split_documents(meta_documents)\n",
"meta_texts = [chunk.page_content for chunk in docs]"
],
"metadata": {
"id": "_lhlnOiEZcPv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Step 2. Set up vector store"
],
"metadata": {
"id": "bR6Iagsz6EE8"
}
},
{
"cell_type": "code",
"source": [
"import pinecone\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores import Pinecone, Weaviate"
],
"metadata": {
"id": "zOpCZoiQWOPI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"# Read credentials from environment variables and prompt only for missing ones,\n",
"# so no secret is ever typed into (and saved with) the notebook.\n",
"pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\") or getpass(\"Pinecone API key: \")\n",
"openai_api_key = os.environ.get(\"OPENAI_API_KEY\") or getpass(\"OpenAI API key: \")\n",
"\n",
"# The environment should be the one specified next to the API key\n",
"# in your Pinecone console\n",
"pinecone_environment = os.environ.get(\"PINECONE_ENVIRONMENT\") or input(\"Pinecone environment: \")\n",
"pinecone_index_name = os.environ.get(\"PINECONE_INDEX\") or input(\"Pinecone index name: \")\n",
"\n",
"# Connect to the existing Pinecone index and wrap it as a LangChain vector store.\n",
"# \"text\" is the metadata key under which each chunk's raw text is stored.\n",
"pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)\n",
"index = pinecone.Index(pinecone_index_name)\n",
"embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n",
"vectorstore = Pinecone(index, embeddings, \"text\")"
],
"metadata": {
"id": "osPpAe0nWOkP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Step 3. Add SEC filings to vector store"
],
"metadata": {
"id": "FQ5SRiyAXPTY"
}
},
{
"cell_type": "code",
"source": [
"# Deterministic IDs make this cell safe to re-run: Pinecone upserts by ID, so a\n",
"# second run overwrites the same vectors instead of adding duplicates.\n",
"aapl_ids = [f\"AAPL-{i}\" for i in range(len(aapl_texts))]\n",
"meta_ids = [f\"META-{i}\" for i in range(len(meta_texts))]\n",
"\n",
"# Each ticker goes into its own namespace so retrieval can be scoped per ticker.\n",
"# The trailing semicolon keeps the returned list of IDs out of the cell output.\n",
"vectorstore.add_texts(aapl_texts, ids=aapl_ids, namespace=\"AAPL\")\n",
"vectorstore.add_texts(meta_texts, ids=meta_ids, namespace=\"META\");"
],
"metadata": {
"id": "7o-qnJ1wX06V"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Step 4. Create Q&A Chain"
],
"metadata": {
"id": "8QmztXJsX-AW"
}
},
{
"cell_type": "code",
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import (\n",
" ConfigurableField,\n",
" RunnableBinding,\n",
" RunnableLambda,\n",
" RunnablePassthrough,\n",
")"
],
"metadata": {
"id": "xUubjxAMX8kb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# This is a basic question-answering chain setup.\n",
"template = \"\"\"Answer the question based only on the following context:\n",
"{context}\n",
"Question: {question}\n",
"\"\"\"\n",
"prompt = ChatPromptTemplate.from_template(template)\n",
"\n",
"# temperature=0 keeps answers as repeatable as possible across runs, which matters\n",
"# when the notebook is re-executed to reproduce figures quoted from the filings.\n",
"model = ChatOpenAI(openai_api_key=openai_api_key, temperature=0)\n",
"\n",
"# Plain retriever over the whole vector store; the namespace is chosen per query later.\n",
"retriever = vectorstore.as_retriever()"
],
"metadata": {
"id": "nGD2TzP-W29h"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Expose the retriever's search_kwargs as a field that can be set per call.\n",
"# All vectorstore retrievers have search_kwargs: a plain dictionary whose\n",
"# keys are specific to the vectorstore in use.\n",
"search_kwargs_field = ConfigurableField(\n",
"    id=\"search_kwargs\",\n",
"    name=\"Search Kwargs\",\n",
"    description=\"The search kwargs to use\",\n",
")\n",
"configurable_retriever = retriever.configurable_fields(search_kwargs=search_kwargs_field)"
],
"metadata": {
"id": "kFICvec0YEZK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def format_docs(docs):\n",
"    \"\"\"Join the text of the retrieved documents into one context string.\n",
"\n",
"    Args:\n",
"        docs: iterable of retrieved documents, each exposing `page_content`.\n",
"\n",
"    Returns:\n",
"        The documents' text separated by blank lines ('' when nothing was retrieved).\n",
"    \"\"\"\n",
"    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
"\n",
"\n",
"# Create the chain. The retriever output is piped through format_docs so the prompt\n",
"# receives plain passage text rather than the repr of a list of Document objects.\n",
"chain = (\n",
"    {\"context\": configurable_retriever | format_docs, \"question\": RunnablePassthrough()}\n",
"    | prompt\n",
"    | model\n",
"    | StrOutputParser()\n",
")"
],
"metadata": {
"id": "y2Pu9wKKYXcz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Step 5. Ask questions, by ticker"
],
"metadata": {
"id": "QeH1RS15cASK"
}
},
{
"cell_type": "code",
"source": [
"# Restrict retrieval to the AAPL namespace for this question\n",
"aapl_config = {\"configurable\": {\"search_kwargs\": {\"namespace\": \"AAPL\"}}}\n",
"chain.invoke(\"What was revenue in July 2023?\", config=aapl_config)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "5euxR9BRYXuo",
"outputId": "77ed3501-bb39-4293-e6e4-a2efd7f525bc"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'The revenue in July 2023 was $81,797 million.'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 53
}
]
},
{
"cell_type": "code",
"source": [
"# Restrict retrieval to the META namespace for this question\n",
"meta_config = {\"configurable\": {\"search_kwargs\": {\"namespace\": \"META\"}}}\n",
"chain.invoke(\"What was revenue in September 2023??\", config=meta_config)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "_2zNW58rYcij",
"outputId": "4c093f96-d1b4-4674-fc7e-5c74ff1e2919"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'The revenue in September 2023 was $34.146 billion.'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 54
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "wl8nn8s7Yd-H"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4,
"colab": {
"provenance": [],
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment