ahpoise/youtube_transcript.ipynb

## youtube_transcript.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ahpoise/f7a03db772be8aab76953e2704bf1118/youtube_transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Setup better response formatting (adds line wrap)\n",
        "from IPython.display import HTML, display\n",
        "\n",
        "# Stylesheet that applies `white-space: pre-wrap` to <pre> elements (line wrap).\n",
        "_PRE_WRAP_CSS = '''\n",
        "  <style>\n",
        "    pre {\n",
        "        white-space: pre-wrap;\n",
        "    }\n",
        "  </style>\n",
        "  '''\n",
        "\n",
        "\n",
        "def set_css():\n",
        "    \"\"\"Display the line-wrapping stylesheet as HTML output.\"\"\"\n",
        "    display(HTML(_PRE_WRAP_CSS))\n",
        "\n",
        "\n",
        "# Registered for the 'pre_run_cell' event so set_css is called for each cell run.\n",
        "get_ipython().events.register('pre_run_cell', set_css)"
      ],
      "metadata": {
        "id": "Q7cVGW9TCwkQ",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Environment Variables\n",
        "import os\n",
        "from google.colab import userdata\n",
        "\n",
        "# Copy each Colab secret into the process environment under the same name.\n",
        "for secret_name in (\"PINECONE_API_KEY\", \"OPENAI_API_KEY\"):\n",
        "    os.environ[secret_name] = userdata.get(secret_name)"
      ],
      "metadata": {
        "id": "rQXZ49rLipo6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yCeAYzyP-pik"
      },
      "outputs": [],
      "source": [
        "#@title Install deps\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "# A single invocation lets pip resolve all of the packages together.\n",
        "%pip install -qqq langchain langchain-core langchain_community langchain-pinecone langchain-text-splitters langchain-openai youtube_transcript_api"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Download an example transcript from a YT video\n",
        "from youtube_transcript_api import YouTubeTranscriptApi\n",
        "\n",
        "VIDEO_ID = \"L-BX5AjGhlw\" # CHANGE THE ID OF THE VIDEO\n",
        "TRANSCRIPT_PATH = \"/content/youtube_transcription.txt\"\n",
        "\n",
        "# srt is a list of dicts; the 'text' entry of each dict is one transcript snippet.\n",
        "if hasattr(YouTubeTranscriptApi, \"get_transcript\"):\n",
        "    srt = YouTubeTranscriptApi.get_transcript(VIDEO_ID)\n",
        "else:\n",
        "    # Releases without the static helper expose an instance API instead;\n",
        "    # to_raw_data() returns the same list-of-dicts shape.\n",
        "    srt = YouTubeTranscriptApi().fetch(VIDEO_ID).to_raw_data()\n",
        "\n",
        "# \"w\" (not \"a\") so re-running this cell does not append a second copy of the\n",
        "# transcript. Snippets are joined with a space so that the last word of one\n",
        "# snippet does not run into the first word of the next.\n",
        "with open(TRANSCRIPT_PATH, \"w\") as file:\n",
        "    file.write(\" \".join(i['text'] for i in srt))"
      ],
      "metadata": {
        "id": "1IXZ7_RZ-5eB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Initialize Model\n",
        "from google.colab import userdata\n",
        "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
        "\n",
        "# Name of the chat model handed to ChatOpenAI below.\n",
        "model = \"gpt-4o\"\n",
        "\n",
        "# Chat model used in the question-answering chain.\n",
        "llm = ChatOpenAI(model=model)\n",
        "\n",
        "# Embeddings object with default settings; passed to the vector store on ingestion.\n",
        "embeddings = OpenAIEmbeddings()"
      ],
      "metadata": {
        "id": "VafWv0xkAWO4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Load the previously created .txt file\n",
        "from langchain_community.document_loaders import TextLoader\n",
        "\n",
        "# Same path the download cell writes the transcript to.\n",
        "loader = TextLoader(\n",
        "    \"/content/youtube_transcription.txt\"\n",
        ")\n",
        "text_documents = loader.load()\n",
        "\n",
        "# Show what was loaded.\n",
        "print(text_documents)"
      ],
      "metadata": {
        "id": "3mDG-ttPCKl6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Split the file into chunks\n",
        "# Import from the langchain-text-splitters package (installed in the deps cell)\n",
        "# rather than the legacy `langchain.text_splitter` path.\n",
        "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
        "\n",
        "# chunk_size / chunk_overlap control the size of each chunk and how much\n",
        "# consecutive chunks share.\n",
        "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)\n",
        "\n",
        "# NOTE: text_documents is rebound from the loaded documents to the list of chunks;\n",
        "# the ingestion cell reads the chunks from this name.\n",
        "text_documents = text_splitter.split_documents(text_documents)\n",
        "\n",
        "print(text_documents)"
      ],
      "metadata": {
        "id": "HttpvtAOCz8R"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Add the docs to the vector db\n",
        "\n",
        "from pinecone import Pinecone\n",
        "from langchain_pinecone import PineconeVectorStore\n",
        "\n",
        "# Don't forget that you have to have an index already created\n",
        "\n",
        "print(\"Ingesting...\")\n",
        "# Pinecone client constructed with no explicit arguments.\n",
        "pc = Pinecone()\n",
        "\n",
        "# Add the split documents to the named index using the embeddings object.\n",
        "vector_store = PineconeVectorStore.from_documents(\n",
        "    text_documents,\n",
        "    embeddings,\n",
        "    index_name=\"youtube-transcript\",\n",
        ")\n",
        "print(\"Ingestion finished.\")"
      ],
      "metadata": {
        "id": "lyD2_1vdHh6Z"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Clean up\n",
        "\n",
        "import os\n",
        "\n",
        "# Transcript file written by the download cell.\n",
        "transcript_path = \"/content/youtube_transcription.txt\"\n",
        "\n",
        "if not os.path.exists(transcript_path):\n",
        "    print(\"The file does not exist\")\n",
        "else:\n",
        "    os.remove(transcript_path)"
      ],
      "metadata": {
        "id": "PIzmde6VR-ic"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Use vector store as a retriever\n",
        "\n",
        "# Configure k on the retriever itself: depending on the installed langchain-core\n",
        "# version, keyword arguments given to retriever.invoke(...) may not reach the\n",
        "# vector store search, whereas search_kwargs set here are part of the retriever.\n",
        "# k=5 matches the value the question loop asks for.\n",
        "retriever = vector_store.as_retriever(search_kwargs={\"k\": 5})"
      ],
      "metadata": {
        "id": "0Zjslun9INSy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Instantiate the parser\n",
        "\n",
        "from langchain_core.output_parsers import StrOutputParser\n",
        "\n",
        "# Last stage of the chain assembled in the final cell (prompt | llm | parser).\n",
        "parser = StrOutputParser()"
      ],
      "metadata": {
        "id": "tyVgqdvYIV0d"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Prompt Template\n",
        "\n",
        "# Same import path the question loop cell uses.\n",
        "from langchain_core.prompts import PromptTemplate\n",
        "\n",
        "# A trailing backslash joins a line with the next one, so each joined sentence\n",
        "# ends with a space to keep the sentences separated in the final prompt.\n",
        "# The instruction paragraph ends without a backslash so the blank line before\n",
        "# \"Context:\" is kept.\n",
        "template = \"\"\"\n",
        "Act as top companies and stock market analyst. \\\n",
        "Answer the question based on the context below. \\\n",
        "If you can't answer the question, answer with \"I don't know\".\n",
        "\n",
        "Context: {context}\n",
        "\n",
        "Question: {question}\n",
        "\"\"\"\n",
        "\n",
        "prompt = PromptTemplate.from_template(template)\n",
        "# Render the template with placeholder values to preview the final prompt text.\n",
        "prompt.format(context=\"Here is some context\", question=\"Here is a question\")"
      ],
      "metadata": {
        "id": "pgi400cnH3jH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Get information about the video\n",
        "from langchain_core.prompts import PromptTemplate\n",
        "\n",
        "questions = [\n",
        "    \"Why are nvidia chips so good?\",\n",
        "    \"How is the stock performance impacted?\",\n",
        "    \"And what about AMD?\"\n",
        "]\n",
        "\n",
        "# Extra keyword arguments forwarded to retriever.invoke for every question.\n",
        "retriever_kwargs = {\n",
        "    \"k\": 5\n",
        "}\n",
        "\n",
        "# The prompt and the chain do not depend on the question, so they are built\n",
        "# once here instead of on every loop iteration.\n",
        "prompt_template = PromptTemplate(\n",
        "    input_variables=[\"context\", \"question\"],\n",
        "    template=template\n",
        ")\n",
        "chain = prompt_template | llm | parser\n",
        "\n",
        "for question in questions:\n",
        "    # Retrieve context for the current question from the vector database\n",
        "    retrieved_context = retriever.invoke(\n",
        "        question,\n",
        "        **retriever_kwargs\n",
        "    )\n",
        "\n",
        "    # Give the prompt only the text of the retrieved documents; formatting the\n",
        "    # list of Document objects directly would insert its Python repr instead.\n",
        "    context_text = \"\\n\\n\".join(doc.page_content for doc in retrieved_context)\n",
        "\n",
        "    # Invoke the chain with the context and question\n",
        "    res = chain.invoke({\n",
        "        \"context\": context_text,\n",
        "        \"question\": question\n",
        "    })\n",
        "\n",
        "    print(res)\n",
        "    print(\"-\" * 80)"
      ],
      "metadata": {
        "id": "9m9Gng3OIFA9"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/ahpoise/f7a03db772be8aab76953e2704bf1118/youtube_transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Setup better response formatting (adds line wrap)\n",
	"from IPython.display import HTML, display\n",
	"\n",
	"# Stylesheet that applies `white-space: pre-wrap` to <pre> elements (line wrap).\n",
	"_PRE_WRAP_CSS = '''\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" '''\n",
	"\n",
	"\n",
	"def set_css():\n",
	"    \"\"\"Display the line-wrapping stylesheet as HTML output.\"\"\"\n",
	"    display(HTML(_PRE_WRAP_CSS))\n",
	"\n",
	"\n",
	"# Registered for the 'pre_run_cell' event so set_css is called for each cell run.\n",
	"get_ipython().events.register('pre_run_cell', set_css)"
	],
	"metadata": {
	"id": "Q7cVGW9TCwkQ",
	"cellView": "form"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Environment Variables\n",
	"import os\n",
	"from google.colab import userdata\n",
	"\n",
	"# Copy each Colab secret into the process environment under the same name.\n",
	"for secret_name in (\"PINECONE_API_KEY\", \"OPENAI_API_KEY\"):\n",
	"    os.environ[secret_name] = userdata.get(secret_name)"
	],
	"metadata": {
	"id": "rQXZ49rLipo6"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "yCeAYzyP-pik"
	},
	"outputs": [],
	"source": [
	"#@title Install deps\n",
	"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
	"# A single invocation lets pip resolve all of the packages together.\n",
	"%pip install -qqq langchain langchain-core langchain_community langchain-pinecone langchain-text-splitters langchain-openai youtube_transcript_api"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Download an example transcript from a YT video\n",
	"from youtube_transcript_api import YouTubeTranscriptApi\n",
	"\n",
	"VIDEO_ID = \"L-BX5AjGhlw\" # CHANGE THE ID OF THE VIDEO\n",
	"TRANSCRIPT_PATH = \"/content/youtube_transcription.txt\"\n",
	"\n",
	"# srt is a list of dicts; the 'text' entry of each dict is one transcript snippet.\n",
	"if hasattr(YouTubeTranscriptApi, \"get_transcript\"):\n",
	"    srt = YouTubeTranscriptApi.get_transcript(VIDEO_ID)\n",
	"else:\n",
	"    # Releases without the static helper expose an instance API instead;\n",
	"    # to_raw_data() returns the same list-of-dicts shape.\n",
	"    srt = YouTubeTranscriptApi().fetch(VIDEO_ID).to_raw_data()\n",
	"\n",
	"# \"w\" (not \"a\") so re-running this cell does not append a second copy of the\n",
	"# transcript. Snippets are joined with a space so that the last word of one\n",
	"# snippet does not run into the first word of the next.\n",
	"with open(TRANSCRIPT_PATH, \"w\") as file:\n",
	"    file.write(\" \".join(i['text'] for i in srt))"
	],
	"metadata": {
	"id": "1IXZ7_RZ-5eB"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Initialize Model\n",
	"from google.colab import userdata\n",
	"from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
	"\n",
	"# Name of the chat model handed to ChatOpenAI below.\n",
	"model = \"gpt-4o\"\n",
	"\n",
	"# Chat model used in the question-answering chain.\n",
	"llm = ChatOpenAI(model=model)\n",
	"\n",
	"# Embeddings object with default settings; passed to the vector store on ingestion.\n",
	"embeddings = OpenAIEmbeddings()"
	],
	"metadata": {
	"id": "VafWv0xkAWO4"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Load the previously created .txt file\n",
	"from langchain_community.document_loaders import TextLoader\n",
	"\n",
	"# Same path the download cell writes the transcript to.\n",
	"loader = TextLoader(\n",
	"    \"/content/youtube_transcription.txt\"\n",
	")\n",
	"text_documents = loader.load()\n",
	"\n",
	"# Show what was loaded.\n",
	"print(text_documents)"
	],
	"metadata": {
	"id": "3mDG-ttPCKl6"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Split the file into chunks\n",
	"# Import from the langchain-text-splitters package (installed in the deps cell)\n",
	"# rather than the legacy `langchain.text_splitter` path.\n",
	"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
	"\n",
	"# chunk_size / chunk_overlap control the size of each chunk and how much\n",
	"# consecutive chunks share.\n",
	"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)\n",
	"\n",
	"# NOTE: text_documents is rebound from the loaded documents to the list of chunks;\n",
	"# the ingestion cell reads the chunks from this name.\n",
	"text_documents = text_splitter.split_documents(text_documents)\n",
	"\n",
	"print(text_documents)"
	],
	"metadata": {
	"id": "HttpvtAOCz8R"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Add the docs to the vector db\n",
	"\n",
	"from pinecone import Pinecone\n",
	"from langchain_pinecone import PineconeVectorStore\n",
	"\n",
	"# Don't forget that you have to have an index already created\n",
	"\n",
	"print(\"Ingesting...\")\n",
	"# Pinecone client constructed with no explicit arguments.\n",
	"pc = Pinecone()\n",
	"\n",
	"# Add the split documents to the named index using the embeddings object.\n",
	"vector_store = PineconeVectorStore.from_documents(\n",
	"    text_documents,\n",
	"    embeddings,\n",
	"    index_name=\"youtube-transcript\",\n",
	")\n",
	"print(\"Ingestion finished.\")"
	],
	"metadata": {
	"id": "lyD2_1vdHh6Z"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Clean up\n",
	"\n",
	"import os\n",
	"\n",
	"# Transcript file written by the download cell.\n",
	"transcript_path = \"/content/youtube_transcription.txt\"\n",
	"\n",
	"if not os.path.exists(transcript_path):\n",
	"    print(\"The file does not exist\")\n",
	"else:\n",
	"    os.remove(transcript_path)"
	],
	"metadata": {
	"id": "PIzmde6VR-ic"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Use vector store as a retriever\n",
	"\n",
	"# Configure k on the retriever itself: depending on the installed langchain-core\n",
	"# version, keyword arguments given to retriever.invoke(...) may not reach the\n",
	"# vector store search, whereas search_kwargs set here are part of the retriever.\n",
	"# k=5 matches the value the question loop asks for.\n",
	"retriever = vector_store.as_retriever(search_kwargs={\"k\": 5})"
	],
	"metadata": {
	"id": "0Zjslun9INSy"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Instantiate the parser\n",
	"\n",
	"from langchain_core.output_parsers import StrOutputParser\n",
	"\n",
	"# Last stage of the chain assembled in the final cell (prompt | llm | parser).\n",
	"parser = StrOutputParser()"
	],
	"metadata": {
	"id": "tyVgqdvYIV0d"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Prompt Template\n",
	"\n",
	"# Same import path the question loop cell uses.\n",
	"from langchain_core.prompts import PromptTemplate\n",
	"\n",
	"# A trailing backslash joins a line with the next one, so each joined sentence\n",
	"# ends with a space to keep the sentences separated in the final prompt.\n",
	"# The instruction paragraph ends without a backslash so the blank line before\n",
	"# \"Context:\" is kept.\n",
	"template = \"\"\"\n",
	"Act as top companies and stock market analyst. \\\n",
	"Answer the question based on the context below. \\\n",
	"If you can't answer the question, answer with \"I don't know\".\n",
	"\n",
	"Context: {context}\n",
	"\n",
	"Question: {question}\n",
	"\"\"\"\n",
	"\n",
	"prompt = PromptTemplate.from_template(template)\n",
	"# Render the template with placeholder values to preview the final prompt text.\n",
	"prompt.format(context=\"Here is some context\", question=\"Here is a question\")"
	],
	"metadata": {
	"id": "pgi400cnH3jH"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"#@title Get information about the video\n",
	"from langchain_core.prompts import PromptTemplate\n",
	"\n",
	"questions = [\n",
	"    \"Why are nvidia chips so good?\",\n",
	"    \"How is the stock performance impacted?\",\n",
	"    \"And what about AMD?\"\n",
	"]\n",
	"\n",
	"# Extra keyword arguments forwarded to retriever.invoke for every question.\n",
	"retriever_kwargs = {\n",
	"    \"k\": 5\n",
	"}\n",
	"\n",
	"# The prompt and the chain do not depend on the question, so they are built\n",
	"# once here instead of on every loop iteration.\n",
	"prompt_template = PromptTemplate(\n",
	"    input_variables=[\"context\", \"question\"],\n",
	"    template=template\n",
	")\n",
	"chain = prompt_template | llm | parser\n",
	"\n",
	"for question in questions:\n",
	"    # Retrieve context for the current question from the vector database\n",
	"    retrieved_context = retriever.invoke(\n",
	"        question,\n",
	"        **retriever_kwargs\n",
	"    )\n",
	"\n",
	"    # Give the prompt only the text of the retrieved documents; formatting the\n",
	"    # list of Document objects directly would insert its Python repr instead.\n",
	"    context_text = \"\\n\\n\".join(doc.page_content for doc in retrieved_context)\n",
	"\n",
	"    # Invoke the chain with the context and question\n",
	"    res = chain.invoke({\n",
	"        \"context\": context_text,\n",
	"        \"question\": question\n",
	"    })\n",
	"\n",
	"    print(res)\n",
	"    print(\"-\" * 80)"
	],
	"metadata": {
	"id": "9m9Gng3OIFA9"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}