Skip to content

Instantly share code, notes, and snippets.

@avelican
Created April 30, 2023 08:19
Show Gist options
  • Save avelican/4a161089b7391415a092833e7aca6989 to your computer and use it in GitHub Desktop.
Save avelican/4a161089b7391415a092833e7aca6989 to your computer and use it in GitHub Desktop.
Get YouTube transcript (from subtitles / caption file)
# Get a YouTube transcript: download the English subtitle / auto-caption track
# with yt-dlp (no video download) and convert the resulting WebVTT file into a
# plain-text file with one caption line per row.
#
# Usage: python <this script> <youtube-video-url>
import glob
import os
import re
import subprocess
import sys

# Inline WebVTT markup: <c>, </c>, <i>, <v Name>, and <00:00:01.500> timing tags.
_TAG_RE = re.compile(r"<[^>]+>")


def download_subtitles(video_url):
    """Run yt-dlp to fetch English subtitles for *video_url* into the cwd.

    Both uploaded and auto-generated tracks are requested; the video itself
    is not downloaded. Exits with status 1 if the yt-dlp executable cannot
    be found. A non-zero yt-dlp exit status is reported but does not abort
    the script, so an already-present .vtt file can still be converted.
    """
    command = [
        "yt-dlp",
        "--write-sub",
        "--write-auto-sub",
        "--sub-lang",
        "en.*",
        "--skip-download",
        video_url,
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Without this, a missing yt-dlp surfaces as a raw traceback.
        print("Error: 'yt-dlp' was not found. Is it installed and on PATH?")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
    else:
        print("Command executed successfully.")


def extract_caption_lines(lines):
    """Return the spoken-text lines of a WebVTT document.

    *lines* is any iterable of strings (for example an open text file).
    The result is a list of stripped caption lines without trailing newlines.

    Dropped: the ``WEBVTT`` header block (everything up to the first blank
    line), cue timing lines (those containing ``-->``), blank lines,
    ``[Music]`` markers and inline ``<...>`` tags. Consecutive duplicate
    lines are collapsed, because auto-generated captions repeat each line
    across overlapping cues; a phrase that recurs later is kept.
    """
    kept = []
    in_header = False
    at_start = True
    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            # A blank line terminates the header block.
            in_header = False
            continue
        if at_start:
            at_start = False
            # Tolerate a byte-order mark in front of the signature.
            if stripped.lstrip("\ufeff").startswith("WEBVTT"):
                in_header = True
        if in_header or "-->" in stripped:
            continue
        text = _TAG_RE.sub("", stripped).strip()
        if not text or text == "[Music]":
            continue
        if kept and kept[-1] == text:
            continue
        kept.append(text)
    return kept


def newest_vtt_file():
    """Return the most recently modified ``*.vtt`` in the cwd, or None."""
    candidates = glob.glob('*.vtt')
    if not candidates:
        return None
    # Prefer the newest file so a leftover from an earlier run is not picked
    # over the subtitles that were just downloaded.
    return max(candidates, key=os.path.getmtime)


def main():
    """Download subtitles for the URL in argv[1] and write ``<name>.vtt.txt``."""
    if len(sys.argv) < 2:
        print("Please provide a YouTube video URL as the first argument.")
        sys.exit(1)

    download_subtitles(sys.argv[1])

    ### convert vtt to txt
    vtt_file = newest_vtt_file()
    if vtt_file is None:
        print("No *.vtt files found in the current directory.")
        return

    output_file = vtt_file + '.txt'
    print('Saving output to ' + output_file)
    # WebVTT is always UTF-8; "utf-8-sig" also swallows an optional BOM.
    with open(vtt_file, 'r', encoding='utf-8-sig') as infile:
        captions = extract_caption_lines(infile)
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(caption + "\n" for caption in captions)
    # NOTE: the downloaded *.vtt files are intentionally left in place.


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment