Created
December 27, 2023 18:01
-
-
Save virattt/985a352b945a0e1164e91415f1ab2eeb to your computer and use it in GitHub Desktop.
langchain-per-ticker-rag
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/virattt/985a352b945a0e1164e91415f1ab2eeb/langchain-per-ticker-rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Step 0. Install dependencies" | |
], | |
"metadata": { | |
"id": "S2mGQxA958dW" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install openai\n", | |
"!pip install pinecone-client\n", | |
"!pip install langchain\n", | |
"!pip install tiktoken\n", | |
"!pip install pypdf" | |
], | |
"metadata": { | |
"id": "2bY0NapN_z98" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Step 1. Load the SEC filings" | |
], | |
"metadata": { | |
"id": "XfKcntc4ZP7_" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n", | |
"from langchain.document_loaders import PyPDFLoader\n", | |
"\n", | |
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)" | |
], | |
"metadata": { | |
"id": "HpaLNifTakx0" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Load $AAPL's financial report. This may take 1-2 minutes since the PDF is large\n", | |
"aapl_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0000320193/f8aaeabb-7a2a-479d-bf72-9559ff51ea5d.pdf\"\n", | |
"\n", | |
"# Create your PDF loader\n", | |
"loader = PyPDFLoader(aapl_10Q)\n", | |
"\n", | |
"# Load the PDF document\n", | |
"aapl_documents = loader.load()\n", | |
"\n", | |
"# Chunk the financial report\n", | |
"docs = text_splitter.split_documents(aapl_documents)\n", | |
"aapl_texts = [d.page_content for d in docs]" | |
], | |
"metadata": { | |
"id": "qd_hnXNoZS8t" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Load $META's financial report. This may take 1-2 minutes since the PDF is large\n", | |
"meta_10Q = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/ba763267-0ccb-4870-a7c5-e1bfd92a9ca7.pdf\"\n", | |
"\n", | |
"# Create your PDF loader\n", | |
"loader = PyPDFLoader(meta_10Q)\n", | |
"\n", | |
"# Load the PDF document\n", | |
"meta_documents = loader.load()\n", | |
"\n", | |
"# Chunk the financial report\n", | |
"docs = text_splitter.split_documents(meta_documents)\n", | |
"meta_texts = [d.page_content for d in docs]" | |
], | |
"metadata": { | |
"id": "_lhlnOiEZcPv" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Step 2. Set up vector store" | |
], | |
"metadata": { | |
"id": "bR6Iagsz6EE8" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import pinecone\n", | |
"from langchain.embeddings.openai import OpenAIEmbeddings\n", | |
"from langchain.vectorstores import Pinecone, Weaviate" | |
], | |
"metadata": { | |
"id": "zOpCZoiQWOPI" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# The environment should be the one specified next to the API key\n", | |
"# in your Pinecone console\n", | |
"pinecone.init(api_key=\"YOUR_PINECONE_API_KEY\", environment=\"YOUR_PINECONE_ENVIRONMENT\")\n", | |
"index = pinecone.Index(\"YOUR_PINECONE_INDEX\")\n", | |
"openai_api_key = 'YOUR_OPENAI_API_KEY'\n", | |
"embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n", | |
"vectorstore = Pinecone(index, embeddings, \"text\")" | |
], | |
"metadata": { | |
"id": "osPpAe0nWOkP" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Step 3. Add SEC filings to vector store" | |
], | |
"metadata": { | |
"id": "FQ5SRiyAXPTY" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"vectorstore.add_texts(aapl_texts, namespace=\"AAPL\")\n", | |
"vectorstore.add_texts(meta_texts, namespace=\"META\")" | |
], | |
"metadata": { | |
"id": "7o-qnJ1wX06V" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Step 4. Create Q&A Chain" | |
], | |
"metadata": { | |
"id": "8QmztXJsX-AW" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from langchain.chat_models import ChatOpenAI\n", | |
"from langchain.embeddings import OpenAIEmbeddings\n", | |
"from langchain.prompts import ChatPromptTemplate\n", | |
"from langchain_core.output_parsers import StrOutputParser\n", | |
"from langchain_core.runnables import (\n", | |
" ConfigurableField,\n", | |
" RunnableBinding,\n", | |
" RunnableLambda,\n", | |
" RunnablePassthrough,\n", | |
")" | |
], | |
"metadata": { | |
"id": "xUubjxAMX8kb" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# This is a basic question-answering chain setup.\n", | |
"template = \"\"\"Answer the question based only on the following context:\n", | |
"{context}\n", | |
"Question: {question}\n", | |
"\"\"\"\n", | |
"prompt = ChatPromptTemplate.from_template(template)\n", | |
"\n", | |
"model = ChatOpenAI(openai_api_key=openai_api_key)\n", | |
"\n", | |
"retriever = vectorstore.as_retriever()" | |
], | |
"metadata": { | |
"id": "nGD2TzP-W29h" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Here we mark the retriever as having a configurable field. All vectorstore retrievers have search_kwargs as a field; it is just a dictionary of vectorstore-specific options (e.g. the namespace to search), which lets us choose the ticker at query time.\n", | |
"configurable_retriever = retriever.configurable_fields(\n", | |
" search_kwargs=ConfigurableField(\n", | |
" id=\"search_kwargs\",\n", | |
" name=\"Search Kwargs\",\n", | |
" description=\"The search kwargs to use\",\n", | |
" )\n", | |
")" | |
], | |
"metadata": { | |
"id": "kFICvec0YEZK" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Create the chain\n", | |
"chain = (\n", | |
" {\"context\": configurable_retriever, \"question\": RunnablePassthrough()}\n", | |
" | prompt\n", | |
" | model\n", | |
" | StrOutputParser()\n", | |
")" | |
], | |
"metadata": { | |
"id": "y2Pu9wKKYXcz" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Step 5. Ask questions, by ticker" | |
], | |
"metadata": { | |
"id": "QeH1RS15cASK" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"chain.invoke(\n", | |
" \"What was revenue in July 2023?\",\n", | |
" config={\"configurable\": {\"search_kwargs\": {\"namespace\": \"AAPL\"}}},\n", | |
")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"id": "5euxR9BRYXuo", | |
"outputId": "77ed3501-bb39-4293-e6e4-a2efd7f525bc" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'The revenue in July 2023 was $81,797 million.'" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 53 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"chain.invoke(\n", | |
" \"What was revenue in September 2023??\",\n", | |
" config={\"configurable\": {\"search_kwargs\": {\"namespace\": \"META\"}}},\n", | |
")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"id": "_2zNW58rYcij", | |
"outputId": "4c093f96-d1b4-4674-fc7e-5c74ff1e2919" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'The revenue in September 2023 was $34.146 billion.'" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 54 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "wl8nn8s7Yd-H" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "base", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.12" | |
}, | |
"orig_nbformat": 4, | |
"colab": { | |
"provenance": [], | |
"include_colab_link": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment