Skip to content

Instantly share code, notes, and snippets.

@ahpoise
Last active May 31, 2025 17:55
Show Gist options
  • Save ahpoise/f7a03db772be8aab76953e2704bf1118 to your computer and use it in GitHub Desktop.
youtube_transcript.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ahpoise/f7a03db772be8aab76953e2704bf1118/youtube_transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"#@title Setup better response formatting (adds line wrap)\n",
"from IPython.display import HTML, display\n",
"\n",
"def set_css():\n",
" display(HTML('''\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" '''))\n",
"get_ipython().events.register('pre_run_cell', set_css)"
],
"metadata": {
"id": "Q7cVGW9TCwkQ",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Environment Variables\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"# Copy the Colab-managed secrets into the process environment so the\n",
"# Pinecone and OpenAI clients can discover them automatically.\n",
"for secret_name in (\"PINECONE_API_KEY\", \"OPENAI_API_KEY\"):\n",
"    os.environ[secret_name] = userdata.get(secret_name)"
],
"metadata": {
"id": "rQXZ49rLipo6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yCeAYzyP-pik"
},
"outputs": [],
"source": [
"#@title Install deps\n",
"# %pip (rather than !pip) guarantees the install targets the kernel's\n",
"# own environment; a single invocation lets pip resolve all of the\n",
"# langchain packages against each other in one pass.\n",
"%pip install -qqq langchain langchain-core langchain_community langchain-pinecone langchain-text-splitters langchain-openai youtube_transcript_api"
]
},
{
"cell_type": "code",
"source": [
"#@title Download an example transcript from a YT video\n",
"from youtube_transcript_api import YouTubeTranscriptApi\n",
"\n",
"VIDEO_ID = \"L-BX5AjGhlw\"  # CHANGE THE ID OF THE VIDEO\n",
"\n",
"srt = YouTubeTranscriptApi.get_transcript(VIDEO_ID)\n",
"\n",
"# Open in \"w\" (not \"a\") so re-running the cell rewrites the file instead\n",
"# of appending a duplicate copy of the transcript; a newline between\n",
"# snippets keeps adjacent words from being fused together.\n",
"with open(\"/content/youtube_transcription.txt\", \"w\") as file:\n",
"    for snippet in srt:\n",
"        file.write(snippet['text'] + \"\\n\")"
],
"metadata": {
"id": "1IXZ7_RZ-5eB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Initialize Model\n",
"from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
"\n",
"# Both clients read OPENAI_API_KEY from the environment set earlier;\n",
"# the previously-present google.colab userdata import was unused here.\n",
"model = \"gpt-4o\"\n",
"\n",
"llm = ChatOpenAI(\n",
"    model=model,\n",
")\n",
"\n",
"embeddings = OpenAIEmbeddings()"
],
"metadata": {
"id": "VafWv0xkAWO4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Load the previously created .txt file\n",
"from langchain_community.document_loaders import TextLoader\n",
"\n",
"loader = TextLoader(\"/content/youtube_transcription.txt\")\n",
"text_documents = loader.load()\n",
"\n",
"# Show a bounded preview rather than dumping the entire transcript\n",
"# into the cell output.\n",
"print(f\"Loaded {len(text_documents)} document(s)\")\n",
"if text_documents:\n",
"    print(text_documents[0].page_content[:500])"
],
"metadata": {
"id": "3mDG-ttPCKl6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Split the file into chunks\n",
"# langchain.text_splitter is a legacy re-export; import from the\n",
"# dedicated langchain_text_splitters package installed above.\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)\n",
"# NOTE: rebinds text_documents, so this cell is not idempotent on re-run;\n",
"# later cells depend on this name.\n",
"text_documents = text_splitter.split_documents(text_documents)\n",
"\n",
"# Report the chunk count instead of printing every chunk.\n",
"print(f\"Split into {len(text_documents)} chunks\")"
],
"metadata": {
"id": "HttpvtAOCz8R"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Add the docs to the vector db\n",
"\n",
"from langchain_pinecone import PineconeVectorStore\n",
"\n",
"# Don't forget that you have to have an index already created.\n",
"# PineconeVectorStore reads PINECONE_API_KEY from the environment, so\n",
"# the explicit (and previously unused) Pinecone() client is not needed.\n",
"print(\"Ingesting...\")\n",
"vector_store = PineconeVectorStore.from_documents(\n",
"    text_documents,\n",
"    embeddings,\n",
"    index_name=\"youtube-transcript\"\n",
")\n",
"print(\"Ingestion finished.\")"
],
"metadata": {
"id": "lyD2_1vdHh6Z"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Clean up\n",
"\n",
"import os\n",
"\n",
"# Remove the intermediate transcript file now that its contents have\n",
"# been ingested into the vector store.\n",
"transcript_path = \"/content/youtube_transcription.txt\"\n",
"if not os.path.exists(transcript_path):\n",
"    print(\"The file does not exist\")\n",
"else:\n",
"    os.remove(transcript_path)"
],
"metadata": {
"id": "PIzmde6VR-ic"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Use vector store as a retriever\n",
"\n",
"# k (the number of chunks returned per query) must be configured here:\n",
"# ad-hoc kwargs passed to retriever.invoke() are not applied as search\n",
"# parameters, so search_kwargs is the supported place for it.\n",
"retriever = vector_store.as_retriever(search_kwargs={\"k\": 5})"
],
"metadata": {
"id": "0Zjslun9INSy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Instantiate the parser\n",
"\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"\n",
"parser = StrOutputParser()"
],
"metadata": {
"id": "tyVgqdvYIV0d"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Prompt Template\n",
"\n",
"# langchain.prompts is a legacy re-export; import from the canonical\n",
"# langchain_core path, matching the import used by the QA cell below.\n",
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"# Trailing backslashes join the instruction lines into a single line\n",
"# inside the rendered prompt.\n",
"template = \"\"\"\n",
"Act as top companies and stock market analyst. \\\n",
"Answer the question based on the context below.\\\n",
"If you can't answer the question, answer with \"I don't know\".\\\n",
"\n",
"Context: {context}\n",
"\n",
"Question: {question}\n",
"\"\"\"\n",
"\n",
"prompt = PromptTemplate.from_template(template)\n",
"# Render once with placeholder values to preview the final prompt text.\n",
"prompt.format(context=\"Here is some context\", question=\"Here is a question\")"
],
"metadata": {
"id": "pgi400cnH3jH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Get information about the video\n",
"\n",
"questions = [\n",
"    \"Why are nvidia chips so good?\",\n",
"    \"How is the stock performance impacted?\",\n",
"    \"And what about AMD?\"\n",
"]\n",
"\n",
"# Build the chain once: it is identical for every question, and the\n",
"# `prompt` object from the Prompt Template cell above already wraps\n",
"# `template`, so re-importing and re-instantiating PromptTemplate per\n",
"# iteration was redundant.\n",
"chain = prompt | llm | parser\n",
"\n",
"for question in questions:\n",
"    # Retrieve context for the current question from the vector database.\n",
"    # The number of chunks is controlled by the retriever's own\n",
"    # search_kwargs; kwargs passed to invoke() are not applied, so the\n",
"    # previous `retriever_kwargs` dict had no effect and was dropped.\n",
"    retrieved_context = retriever.invoke(question)\n",
"\n",
"    # Invoke the chain with the context and question.\n",
"    answer = chain.invoke({\n",
"        \"context\": retrieved_context,\n",
"        \"question\": question\n",
"    })\n",
"\n",
"    print(answer)\n",
"    print(\"-\" * 80)"
],
"metadata": {
"id": "9m9Gng3OIFA9"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment