Skip to content

Instantly share code, notes, and snippets.

@d33pfri3d
Created January 23, 2024 18:29
Show Gist options
  • Save d33pfri3d/e1ed6e86dbe86cb611159e6e1deee3ba to your computer and use it in GitHub Desktop.
Save d33pfri3d/e1ed6e86dbe86cb611159e6e1deee3ba to your computer and use it in GitHub Desktop.
Transcript Downloader for YouTube + Parser
import os
def search_phrase_in_vtt_files(directory, phrase, output_file):
    """Record which ``.vtt`` files in *directory* contain *phrase*.

    Each matching file name is printed as ``Found in <name>`` and written to
    *output_file*, one name per line, in sorted order. *output_file* is
    always (re)created, so it ends up empty when nothing matches.

    Args:
        directory: Folder whose direct entries are scanned (not recursive).
        phrase: Exact, case-sensitive substring to look for in each file.
        output_file: Path of the UTF-8 text file that receives the names.
    """
    found_files = []
    # os.listdir() returns entries in arbitrary order; sort so the report
    # is reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.vtt'):
            continue
        path = os.path.join(directory, filename)
        # A directory that happens to be named "*.vtt" cannot be opened as a
        # file, so skip anything that is not a regular file.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            if phrase in file.read():
                found_files.append(filename)

    with open(output_file, 'w', encoding='utf-8') as out_file:
        for name in found_files:
            print(f'Found in {name}')
            out_file.write(name + '\n')
if __name__ == "__main__":
    # Search settings: the text to look for, the folder holding the VTT
    # transcripts, and the report file that will list the matching names.
    search_phrase = 'SEARCH_PHRASE'
    vtt_directory = './transcripts'
    output_filename = 'found_files.txt'

    # Scan the transcripts and write the report.
    search_phrase_in_vtt_files(
        vtt_directory,
        search_phrase,
        output_filename,
    )

    summary = f"Files containing the phrase '{search_phrase}' have been listed in {output_filename}"
    print(summary)
import os
import subprocess
import sys
def install_ytdlp():
    """Install the ``yt-dlp`` package with pip for the running interpreter.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    # Invoke pip through the current interpreter (``python -m pip``).
    pip_install = [sys.executable, "-m", "pip", "install", "yt-dlp"]
    subprocess.check_call(pip_install)
def get_video_ids(channel_url):
    """Return the video IDs that yt-dlp reports for *channel_url*.

    Runs ``yt-dlp --get-id <channel_url>`` and splits the captured standard
    output on whitespace.

    Args:
        channel_url: URL handed to yt-dlp unchanged, as a single argument.

    Returns:
        A list of strings, one per whitespace-separated token printed by
        yt-dlp.

    Raises:
        SystemExit: With code 1 when yt-dlp cannot be started or exits with
            a non-zero status; the error text is printed first.
    """
    print('Getting the Video IDS')
    # Use an argument list instead of a shell string: URLs routinely contain
    # "&", "?" and ";", which a shell would interpret (truncating the URL or
    # running extra commands).
    command = ["yt-dlp", "--get-id", channel_url]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as exc:
        # Without a shell, a missing executable raises instead of producing
        # a non-zero exit status; keep the same "report and exit" contract.
        print(f"Error fetching video IDs: {exc}")
        sys.exit(1)
    if result.returncode != 0:
        print(f"Error fetching video IDs: {result.stderr}")
        sys.exit(1)
    return result.stdout.split()
def download_transcripts(video_ids):
    """Ask yt-dlp for the English auto-generated subtitles of each video.

    For every ID, runs yt-dlp with ``--skip-download --write-auto-sub
    --sub-lang en`` against ``https://www.youtube.com/watch?v=<id>``. No
    output location is passed, so yt-dlp's own default applies.

    A failure for one video is reported and the loop continues with the
    next; if yt-dlp itself cannot be started, the error is reported and the
    remaining videos are skipped.

    Args:
        video_ids: Iterable of video ID strings.
    """
    for video_id in video_ids:
        print(f'Downloading Transcript for {video_id}')
        # Argument list, no shell: the "?" in the URL and any unusual
        # characters in the ID reach yt-dlp verbatim.
        command = [
            "yt-dlp",
            "--skip-download",
            "--write-auto-sub",
            "--sub-lang",
            "en",
            f"https://www.youtube.com/watch?v={video_id}",
        ]
        try:
            result = subprocess.run(command)
        except FileNotFoundError as exc:
            # Every later video would fail the same way, so stop here.
            print(f"Error downloading transcripts: {exc}")
            return
        if result.returncode != 0:
            # Best effort: report the failure instead of hiding it, then
            # move on to the next video.
            print(
                f"Failed to download transcript for {video_id} "
                f"(exit code {result.returncode})"
            )
if __name__ == "__main__":
    # The pip install step is left disabled; uncomment the call below to
    # install yt-dlp before running.
    # install_ytdlp()

    # Placeholder: point this at the real channel's "videos" page.
    channel_url = "https://www.youtube.com/@CHANNEL/videos"

    # First collect every video ID for the channel ...
    video_ids = get_video_ids(channel_url)

    # ... then fetch the transcript for each of them.
    download_transcripts(video_ids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment