Last active
May 31, 2025 17:55
-
-
Save ahpoise/f7a03db772be8aab76953e2704bf1118 to your computer and use it in GitHub Desktop.
youtube_transcript.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/ahpoise/f7a03db772be8aab76953e2704bf1118/youtube_transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Setup better response formatting (adds line wrap)\n", | |
"from IPython.display import HTML, display\n", | |
"\n", | |
"def set_css(*_callback_args):\n", | |
"    \"\"\"Display a <style> rule that sets `white-space: pre-wrap` on <pre> elements.\n", | |
"\n", | |
"    Registered below as a `pre_run_cell` callback. IPython documents that such\n", | |
"    callbacks receive an execution-info argument, so any positional arguments\n", | |
"    are accepted and ignored; calling `set_css()` with none still works.\n", | |
"    \"\"\"\n", | |
"    display(HTML('''\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" '''))\n", | |
"\n", | |
"# Run set_css before each cell executes.\n", | |
"get_ipython().events.register('pre_run_cell', set_css)" | |
], | |
"metadata": { | |
"id": "Q7cVGW9TCwkQ", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Environment Variables\n", | |
"import os\n", | |
"from google.colab import userdata\n", | |
"\n", | |
"# Copy each Colab secret into an environment variable of the same name.\n", | |
"for secret_name in (\"PINECONE_API_KEY\", \"OPENAI_API_KEY\"):\n", | |
"    os.environ[secret_name] = userdata.get(secret_name)" | |
], | |
"metadata": { | |
"id": "rQXZ49rLipo6" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "yCeAYzyP-pik" | |
}, | |
"outputs": [], | |
"source": [ | |
"#@title Install deps\n", | |
"# %pip (rather than !pip) installs into the environment of the running kernel,\n", | |
"# and a single invocation lets pip resolve all packages together.\n", | |
"# TODO: pin versions once a known-good combination has been confirmed.\n", | |
"%pip install -qqq langchain langchain-core langchain_community langchain-pinecone langchain-text-splitters langchain-openai youtube_transcript_api" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Download an example transcript from a YT video\n", | |
"from youtube_transcript_api import YouTubeTranscriptApi\n", | |
"\n", | |
"VIDEO_ID = \"L-BX5AjGhlw\"  # CHANGE THE ID OF THE VIDEO\n", | |
"TRANSCRIPT_PATH = \"/content/youtube_transcription.txt\"\n", | |
"\n", | |
"# Newer youtube_transcript_api releases expose an instance method `fetch()`;\n", | |
"# older ones only have the static `get_transcript()`. Support both so the cell\n", | |
"# works with whichever version the unpinned install resolved to.\n", | |
"if hasattr(YouTubeTranscriptApi, \"fetch\"):\n", | |
"    srt = YouTubeTranscriptApi().fetch(VIDEO_ID).to_raw_data()\n", | |
"else:\n", | |
"    srt = YouTubeTranscriptApi.get_transcript(VIDEO_ID)\n", | |
"\n", | |
"# \"w\" rather than \"a\": re-running the cell replaces the file instead of\n", | |
"# appending a second copy of the transcript. Segments are joined with a space\n", | |
"# so the last word of one segment is not glued to the first word of the next.\n", | |
"with open(TRANSCRIPT_PATH, \"w\") as file:\n", | |
"    file.write(\" \".join(segment[\"text\"] for segment in srt))" | |
], | |
"metadata": { | |
"id": "1IXZ7_RZ-5eB" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Initialize Model\n", | |
"from google.colab import userdata\n", | |
"from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n", | |
"\n", | |
"# Name of the chat model; edit this line to try a different one.\n", | |
"model = \"gpt-4o\"\n", | |
"\n", | |
"# Chat model used by the question-answering chain later in the notebook.\n", | |
"llm = ChatOpenAI(model=model)\n", | |
"\n", | |
"# Embedding model handed to the vector store when the chunks are ingested.\n", | |
"embeddings = OpenAIEmbeddings()" | |
], | |
"metadata": { | |
"id": "VafWv0xkAWO4" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Load the previously created .txt file\n", | |
"from langchain_community.document_loaders import TextLoader\n", | |
"\n", | |
"# Read the transcript file written by the download cell into LangChain documents.\n", | |
"loader = TextLoader(\"/content/youtube_transcription.txt\")\n", | |
"text_documents = loader.load()\n", | |
"\n", | |
"# Show what was loaded.\n", | |
"print(text_documents)" | |
], | |
"metadata": { | |
"id": "3mDG-ttPCKl6" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Split the file into chunks\n", | |
"# Import from the dedicated `langchain_text_splitters` package that the install\n", | |
"# cell adds, instead of the legacy `langchain.text_splitter` import path.\n", | |
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n", | |
"\n", | |
"# Chunks of up to 500 characters, with 20 characters shared between neighbours.\n", | |
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)\n", | |
"\n", | |
"# NOTE: this rebinds `text_documents` from the loaded file to its chunks; the\n", | |
"# ingestion cell reads the chunks under this same name.\n", | |
"text_documents = text_splitter.split_documents(text_documents)\n", | |
"\n", | |
"print(text_documents)" | |
], | |
"metadata": { | |
"id": "HttpvtAOCz8R" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Add the docs to the vector db\n", | |
"\n", | |
"from pinecone import Pinecone\n", | |
"from langchain_pinecone import PineconeVectorStore\n", | |
"\n", | |
"# Don't forget that you have to have an index already created\n", | |
"\n", | |
"print(\"Ingesting...\")\n", | |
"\n", | |
"# Client built with no explicit arguments; it is not referenced again in this cell.\n", | |
"pc = Pinecone()\n", | |
"\n", | |
"# Embed the transcript chunks and write them to the \"youtube-transcript\" index.\n", | |
"vector_store = PineconeVectorStore.from_documents(\n", | |
"    text_documents, embeddings, index_name=\"youtube-transcript\"\n", | |
")\n", | |
"\n", | |
"print(\"Ingestion finished.\")" | |
], | |
"metadata": { | |
"id": "lyD2_1vdHh6Z" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Clean up\n", | |
"\n", | |
"import os\n", | |
"\n", | |
"# Remove the temporary transcript file, or report that it is already gone.\n", | |
"transcript_path = \"/content/youtube_transcription.txt\"\n", | |
"if not os.path.exists(transcript_path):\n", | |
"    print(\"The file does not exist\")\n", | |
"else:\n", | |
"    os.remove(transcript_path)" | |
], | |
"metadata": { | |
"id": "PIzmde6VR-ic" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Use vector store as a retriever\n", | |
"\n", | |
"# Wrap the vector store in a retriever; the Q&A cell calls `retriever.invoke(...)`.\n", | |
"retriever = vector_store.as_retriever()" | |
], | |
"metadata": { | |
"id": "0Zjslun9INSy" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Instantiate the parser\n", | |
"\n", | |
"from langchain_core.output_parsers import StrOutputParser\n", | |
"\n", | |
"# Final stage of the `prompt | llm | parser` chain built in the Q&A cell.\n", | |
"parser = StrOutputParser()" | |
], | |
"metadata": { | |
"id": "tyVgqdvYIV0d" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Prompt Template\n", | |
"\n", | |
"# Same import path as the Q&A cell uses (`langchain_core.prompts`), instead of\n", | |
"# the legacy `langchain.prompts` path.\n", | |
"from langchain_core.prompts import PromptTemplate\n", | |
"\n", | |
"# A backslash at the end of a line inside the string joins the next line onto\n", | |
"# it, so each continued line ends with a space to keep the sentences apart.\n", | |
"# The last instruction line has no continuation, so a blank line separates the\n", | |
"# instructions from the context.\n", | |
"template = \"\"\"\n", | |
"Act as top companies and stock market analyst. \\\n", | |
"Answer the question based on the context below. \\\n", | |
"If you can't answer the question, answer with \"I don't know\".\n", | |
"\n", | |
"Context: {context}\n", | |
"\n", | |
"Question: {question}\n", | |
"\"\"\"\n", | |
"\n", | |
"prompt = PromptTemplate.from_template(template)\n", | |
"\n", | |
"# Preview the rendered prompt with placeholder values (shown as the cell output).\n", | |
"prompt.format(context=\"Here is some context\", question=\"Here is a question\")" | |
], | |
"metadata": { | |
"id": "pgi400cnH3jH" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title Get information about the video\n", | |
"from langchain_core.prompts import PromptTemplate\n", | |
"\n", | |
"questions = [\n", | |
"    \"Why are nvidia chips so good?\",\n", | |
"    \"How is the stock performance impacted?\",\n", | |
"    \"And what about AMD?\"\n", | |
"]\n", | |
"\n", | |
"# Extra keyword arguments forwarded to the retriever on every lookup.\n", | |
"retriever_kwargs = {\n", | |
"    \"k\": 5\n", | |
"}\n", | |
"\n", | |
"# The prompt and the chain do not depend on the question, so they are built\n", | |
"# once here instead of on every loop iteration.\n", | |
"prompt_template = PromptTemplate(\n", | |
"    input_variables=[\"context\", \"question\"],\n", | |
"    template=template\n", | |
")\n", | |
"chain = prompt_template | llm | parser\n", | |
"\n", | |
"for question in questions:\n", | |
"    # Retrieve context for the current question from the vector database\n", | |
"    retrieved_context = retriever.invoke(\n", | |
"        question,\n", | |
"        **retriever_kwargs\n", | |
"    )\n", | |
"\n", | |
"    # Hand the prompt only the chunk text. Formatting the Document objects\n", | |
"    # themselves would insert their repr into the prompt instead.\n", | |
"    context_text = \"\\n\\n\".join(doc.page_content for doc in retrieved_context)\n", | |
"\n", | |
"    # Invoke the chain with the context and question\n", | |
"    res = chain.invoke({\n", | |
"        \"context\": context_text,\n", | |
"        \"question\": question\n", | |
"    })\n", | |
"\n", | |
"    print(res)\n", | |
"    print(\"-\" * 80)" | |
], | |
"metadata": { | |
"id": "9m9Gng3OIFA9" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment