ritwikraha/yt-transcript.ipynb

## yt-transcript.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyPhPAYMRz95/kK9H7CkatAx",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ritwikraha/76ba0856b8c73b62083869140adca6af/yt-transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xjUUWhux7w05",
        "outputId": "f5ec4efb-a5d0-4bc5-8757-86c954374735"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting youtube_transcript_api\n",
            "  Downloading youtube_transcript_api-0.6.1-py3-none-any.whl (24 kB)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from youtube_transcript_api) (2.31.0)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (3.4)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (2.0.7)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (2023.7.22)\n",
            "Installing collected packages: youtube_transcript_api\n",
            "Successfully installed youtube_transcript_api-0.6.1\n"
          ]
        }
      ],
      "source": [
        "# Pin the package to the version recorded in this cell's saved output so a\n",
        "# rerun uses the same API. %pip installs into the running kernel's environment.\n",
        "%pip install youtube_transcript_api==0.6.1"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Import the YouTubeTranscriptApi class from the youtube_transcript_api package\n",
        "from youtube_transcript_api import YouTubeTranscriptApi"
      ],
      "metadata": {
        "id": "bSTqToCf8jmp"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import googleapiclient.discovery"
      ],
      "metadata": {
        "id": "THQfl9IzV44p"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "# Create a YouTube API service object. The key is read from the\n",
        "# YOUTUBE_API_KEY environment variable so a real key is never saved in the\n",
        "# notebook; the 'API-KEY' placeholder is only the fallback when it is unset.\n",
        "youtube = googleapiclient.discovery.build(\n",
        "    'youtube', 'v3', developerKey=os.environ.get('YOUTUBE_API_KEY', 'API-KEY')\n",
        ")\n",
        "\n",
        "# Playlist whose items will be listed.\n",
        "playlist_id = 'PLJicmE8fK0EiFRt1Hm5a_7SJFaikIFW30'\n",
        "\n",
        "# Build the first playlistItems.list() request. maxResults is set to 50 (the\n",
        "# API maximum) because the default page size is only 5 items.\n",
        "request = youtube.playlistItems().list(\n",
        "    part='snippet', playlistId=playlist_id, maxResults=50\n",
        ")\n",
        "\n",
        "# Get the first page of the response.\n",
        "response = request.execute()\n",
        "\n",
        "# Follow the pagination so response['items'] holds every playlist item, not\n",
        "# just the first page. list_next() returns None when there are no more pages.\n",
        "next_request = youtube.playlistItems().list_next(request, response)\n",
        "while next_request is not None:\n",
        "    page = next_request.execute()\n",
        "    response['items'].extend(page.get('items', []))\n",
        "    next_request = youtube.playlistItems().list_next(next_request, page)"
      ],
      "metadata": {
        "id": "RjKLxajJWK3b"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Collect the video ID of every playlist item in the response.\n",
        "video_ids = [\n",
        "    entry['snippet']['resourceId']['videoId']\n",
        "    for entry in response['items']\n",
        "]"
      ],
      "metadata": {
        "id": "Yewhvw21XuH1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Print the video IDs.\n",
        "print(video_ids)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "o7_nFEn2VjTc",
        "outputId": "e25fb659-9703-480c-b29e-4f24ce0817ea"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['N5vJSNXPEwA', '7yDmGnA8Hw0', '98TQv5IAtY8', 'mmkCS5eA4f8', 'LKvjIsyYng8']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def generate_transcript(id):\n",
        "    \"\"\"Fetch a video's transcript and flatten it into a single string.\n",
        "\n",
        "    Args:\n",
        "        id: YouTube video ID passed to `YouTubeTranscriptApi.get_transcript`.\n",
        "            The parameter name is kept for existing callers even though it\n",
        "            shadows the builtin `id` inside this function.\n",
        "\n",
        "    Returns:\n",
        "        A `(script, word_count)` tuple. `script` is the text of every segment\n",
        "        that is not exactly '[Music]', each followed by one space.\n",
        "        `word_count` is the number of whitespace-separated words in `script`.\n",
        "    \"\"\"\n",
        "    transcript = YouTubeTranscriptApi.get_transcript(id)\n",
        "\n",
        "    # Build the text with one join instead of repeated `+=` on a growing string.\n",
        "    script = \"\".join(\n",
        "        segment[\"text\"] + \" \"\n",
        "        for segment in transcript\n",
        "        if segment[\"text\"] != '[Music]'\n",
        "    )\n",
        "\n",
        "    return script, len(script.split())\n"
      ],
      "metadata": {
        "id": "QmqJkacT78_X"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import csv\n",
        "\n",
        "# Path to the CSV file\n",
        "csv_file_path = 'transcripts.csv'\n",
        "\n",
        "# newline='' lets the csv module control line endings itself.\n",
        "with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:\n",
        "    writer = csv.writer(file)\n",
        "\n",
        "    # Header row\n",
        "    writer.writerow(['video_id', 'raw_text'])\n",
        "\n",
        "    # One row per video. The loop variable is `video_id` rather than `id` so the\n",
        "    # builtin `id()` is not overwritten in the notebook's global namespace.\n",
        "    for video_id in video_ids:\n",
        "        # generate_transcript also returns a word count, which is not written out.\n",
        "        transcript, no_of_words = generate_transcript(video_id)\n",
        "        writer.writerow([video_id, transcript])\n",
        "\n",
        "# Inform the user that the process is complete\n",
        "print(\"CSV file has been created with transcripts.\")\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0bzBtdbGYHVX",
        "outputId": "a591b152-3f62-4a78-ef9f-5e11e2539316"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "CSV file has been created with transcripts.\n"
          ]
        }
      ]
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"authorship_tag": "ABX9TyPhPAYMRz95/kK9H7CkatAx",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/ritwikraha/76ba0856b8c73b62083869140adca6af/yt-transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "xjUUWhux7w05",
	"outputId": "f5ec4efb-a5d0-4bc5-8757-86c954374735"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Collecting youtube_transcript_api\n",
	" Downloading youtube_transcript_api-0.6.1-py3-none-any.whl (24 kB)\n",
	"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from youtube_transcript_api) (2.31.0)\n",
	"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (3.3.2)\n",
	"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (3.4)\n",
	"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (2.0.7)\n",
	"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (2023.7.22)\n",
	"Installing collected packages: youtube_transcript_api\n",
	"Successfully installed youtube_transcript_api-0.6.1\n"
	]
	}
	],
	"source": [
	"# Pin the package to the version recorded in this cell's saved output so a\n",
	"# rerun uses the same API. %pip installs into the running kernel's environment.\n",
	"%pip install youtube_transcript_api==0.6.1"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Import the YouTubeTranscriptApi class from the youtube_transcript_api package\n",
	"from youtube_transcript_api import YouTubeTranscriptApi"
	],
	"metadata": {
	"id": "bSTqToCf8jmp"
	},
	"execution_count": 2,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import googleapiclient.discovery"
	],
	"metadata": {
	"id": "THQfl9IzV44p"
	},
	"execution_count": 3,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import os\n",
	"\n",
	"# Create a YouTube API service object. The key is read from the\n",
	"# YOUTUBE_API_KEY environment variable so a real key is never saved in the\n",
	"# notebook; the 'API-KEY' placeholder is only the fallback when it is unset.\n",
	"youtube = googleapiclient.discovery.build(\n",
	"    'youtube', 'v3', developerKey=os.environ.get('YOUTUBE_API_KEY', 'API-KEY')\n",
	")\n",
	"\n",
	"# Playlist whose items will be listed.\n",
	"playlist_id = 'PLJicmE8fK0EiFRt1Hm5a_7SJFaikIFW30'\n",
	"\n",
	"# Build the first playlistItems.list() request. maxResults is set to 50 (the\n",
	"# API maximum) because the default page size is only 5 items.\n",
	"request = youtube.playlistItems().list(\n",
	"    part='snippet', playlistId=playlist_id, maxResults=50\n",
	")\n",
	"\n",
	"# Get the first page of the response.\n",
	"response = request.execute()\n",
	"\n",
	"# Follow the pagination so response['items'] holds every playlist item, not\n",
	"# just the first page. list_next() returns None when there are no more pages.\n",
	"next_request = youtube.playlistItems().list_next(request, response)\n",
	"while next_request is not None:\n",
	"    page = next_request.execute()\n",
	"    response['items'].extend(page.get('items', []))\n",
	"    next_request = youtube.playlistItems().list_next(next_request, page)"
	],
	"metadata": {
	"id": "RjKLxajJWK3b"
	},
	"execution_count": 6,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Collect the video ID of every playlist item in the response.\n",
	"video_ids = [\n",
	"    entry['snippet']['resourceId']['videoId']\n",
	"    for entry in response['items']\n",
	"]"
	],
	"metadata": {
	"id": "Yewhvw21XuH1"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Print the video IDs.\n",
	"print(video_ids)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "o7_nFEn2VjTc",
	"outputId": "e25fb659-9703-480c-b29e-4f24ce0817ea"
	},
	"execution_count": 7,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"['N5vJSNXPEwA', '7yDmGnA8Hw0', '98TQv5IAtY8', 'mmkCS5eA4f8', 'LKvjIsyYng8']\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"def generate_transcript(id):\n",
	"    \"\"\"Fetch a video's transcript and flatten it into a single string.\n",
	"\n",
	"    Args:\n",
	"        id: YouTube video ID passed to `YouTubeTranscriptApi.get_transcript`.\n",
	"            The parameter name is kept for existing callers even though it\n",
	"            shadows the builtin `id` inside this function.\n",
	"\n",
	"    Returns:\n",
	"        A `(script, word_count)` tuple. `script` is the text of every segment\n",
	"        that is not exactly '[Music]', each followed by one space.\n",
	"        `word_count` is the number of whitespace-separated words in `script`.\n",
	"    \"\"\"\n",
	"    transcript = YouTubeTranscriptApi.get_transcript(id)\n",
	"\n",
	"    # Build the text with one join instead of repeated `+=` on a growing string.\n",
	"    script = \"\".join(\n",
	"        segment[\"text\"] + \" \"\n",
	"        for segment in transcript\n",
	"        if segment[\"text\"] != '[Music]'\n",
	"    )\n",
	"\n",
	"    return script, len(script.split())\n"
	],
	"metadata": {
	"id": "QmqJkacT78_X"
	},
	"execution_count": 12,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import csv\n",
	"\n",
	"# Path to the CSV file\n",
	"csv_file_path = 'transcripts.csv'\n",
	"\n",
	"# newline='' lets the csv module control line endings itself.\n",
	"with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:\n",
	"    writer = csv.writer(file)\n",
	"\n",
	"    # Header row\n",
	"    writer.writerow(['video_id', 'raw_text'])\n",
	"\n",
	"    # One row per video. The loop variable is `video_id` rather than `id` so the\n",
	"    # builtin `id()` is not overwritten in the notebook's global namespace.\n",
	"    for video_id in video_ids:\n",
	"        # generate_transcript also returns a word count, which is not written out.\n",
	"        transcript, no_of_words = generate_transcript(video_id)\n",
	"        writer.writerow([video_id, transcript])\n",
	"\n",
	"# Inform the user that the process is complete\n",
	"print(\"CSV file has been created with transcripts.\")\n"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "0bzBtdbGYHVX",
	"outputId": "a591b152-3f62-4a78-ef9f-5e11e2539316"
	},
	"execution_count": 13,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"CSV file has been created with transcripts.\n"
	]
	}
	]
	}
	]
	}