Skip to content

Instantly share code, notes, and snippets.

Last active April 3, 2023 12:56
Show Gist options
  • Save suqingdong/bcf756910321569fb44302bac52edc48 to your computer and use it in GitHub Desktop.
Save suqingdong/bcf756910321569fb44302bac52edc48 to your computer and use it in GitHub Desktop.
Download subtitle from Youtube Videos
#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""
Download subtitles from YouTube videos.
Update: added playlist URL parsing.
"""
import os
import re
import sys
import json
import argparse
import bs4
import requests
def get_response(url):
    """Fetch ``url`` with an HTTP GET and return the ``requests`` response.

    The request is sent with the module-level ``Proxies`` and ``Headers``
    settings, which the ``__main__`` section of this script assigns before
    it calls ``main``.
    """
    # Without a timeout a stalled server would block the whole batch forever.
    return requests.get(url, proxies=Proxies, headers=Headers, timeout=30)
def get_sub_url(url):
    """Return the subtitle download URL listed for the video at ``url``.

    The video URL is passed as the ``url`` query parameter to the page at
    ``BaseURL``; the ``href`` of the first element matching the CSS selector
    ``#show b a`` is appended to ``BaseURL`` and returned.

    Raises:
        IndexError: if the page contains no element matching the selector.
    """
    full_url = '{}?url={}'.format(BaseURL, url)
    response = get_response(full_url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): this line was garbled in the source (a stray closing
    # parenthesis and an otherwise unused ``soup``); ``soup.select(...)`` is
    # the reconstruction that matches the surviving fragments -- confirm.
    links = soup.select('#show b a')
    sub_url = BaseURL + links[0].attrs['href']
    return sub_url
def get_sub_url_list(url):
    """Return the video URLs of every entry in the playlist page at ``url``.

    The playlist page embeds its data as a ``window["ytInitialData"]`` JSON
    object; this function extracts that object with a regular expression and
    reads the ``videoId`` of each ``playlistVideoRenderer`` entry.

    Raises:
        IndexError: if the page does not contain the ``ytInitialData`` object.
        KeyError: if the JSON does not have the expected structure.
    """
    response = get_response(url)
    result = re.findall(r'window\["ytInitialData"\] = ({.*?});', response.text)
    data = json.loads(result[0])

    # NOTE(review): the continuation lines of this lookup were missing from
    # the source (it ended in a dangling backslash). The remainder of the path
    # below is a reconstruction of the playlist page layout -- verify against
    # a live response before relying on it.
    playlist = data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']\
        ['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']\
        ['contents'][0]['playlistVideoListRenderer']['contents']

    sub_url_list = []
    for each in playlist:
        videoId = each['playlistVideoRenderer']['videoId']
        # NOTE(review): the URL prefix was blank in the source; the standard
        # watch URL is assumed here -- confirm.
        videoURL = 'https://www.youtube.com/watch?v=' + videoId
        # The source never appended anything, so the result was always empty.
        sub_url_list.append(videoURL)
    return sub_url_list
def get_sub_name(sub_url):
    """Build the ``.srt`` file name from the ``title`` parameter of ``sub_url``.

    The text between ``title=`` and the following ``&url`` is taken as the
    title; ``+`` is turned back into a space and ``%23`` into ``#``. No other
    percent-escapes are decoded.

    Raises:
        IndexError: if ``sub_url`` has no ``title=...&url`` section.
    """
    sub_name = re.findall(r'title=(.*?)&url', sub_url)[0]
    sub_name = sub_name.replace('+', ' ')
    sub_name = sub_name.replace('%23', '#')
    sub_name += '.srt'
    return sub_name
def save_sub(sub_url, outdir):
    """Download the subtitle at ``sub_url`` and save it inside ``outdir``.

    The file name comes from ``get_sub_name(sub_url)``. ``outdir`` must
    already exist.
    """
    response = get_response(sub_url)
    sub_name = get_sub_name(sub_url)
    outpath = os.path.join(outdir, sub_name)
    # The source opened the file but the write itself was missing. The raw
    # response bytes are written in binary mode so that non-ASCII subtitles
    # are stored unchanged, with no implicit encoding step.
    with open(outpath, 'wb') as out:
        out.write(response.content)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Succesfully download subtitle: "%s"' % outpath)
def main(infile, outdir):
    """Download subtitles for every URL listed in ``infile`` into ``outdir``.

    ``infile`` holds one URL per line. A line containing ``playlist`` is
    expanded into its individual video URLs with ``get_sub_url_list``; any
    other line is treated as a single video URL. ``outdir`` is created when
    it does not exist.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    with open(infile) as f:
        for line in f:
            url = line.strip()
            if not url:
                # Blank lines would otherwise be sent off as empty URLs.
                continue

            sub_url_list = [url]
            if 'playlist' in url:
                print('This is a playlist url: %s' % url)
                sub_url_list = get_sub_url_list(url)

            for url in sub_url_list:
                # NOTE(review): the try/except around these calls was lost in
                # the source. IndexError is what get_sub_url and get_sub_name
                # raise when the subtitle link or title is absent; confirm
                # whether the original caught anything broader.
                try:
                    sub_url = get_sub_url(url)
                    save_sub(sub_url, outdir)
                except IndexError:
                    print('Could not find a subtitle for url: "%s"' % url)
if __name__ == "__main__":
    # Command-line entry point: parse the options, publish the module-level
    # settings that get_response() and get_sub_url() read, then run main().
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i', '--infile', help='The URL list file.', required=True)
    parser.add_argument('-o', '--outdir', help='The output dir[default: "%(default)s"].', default='subs')
    parser.add_argument('-p', '--proxies', help='The Proxies[default: %(default)s].', default='')
    args = vars(parser.parse_args())

    infile = args.get('infile')
    outdir = args.get('outdir')
    proxies = args.get('proxies')

    # NOTE(review): BaseURL is empty in this copy of the script; it looks like
    # the address of the subtitle service was lost. It must be set before
    # get_sub_url() can build a usable request URL.
    BaseURL = ''

    # The same proxy address is used for both schemes.
    Proxies = {
        'http': proxies,
        'https': proxies,
    }

    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    }

    main(infile, outdir)
Copy link

Script is not working. Can you please check?

Copy link

same problem, is it possible to fix it?

Copy link

You can use this instead.

import os
import datetime
import json
import subprocess

import srt
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

# Script: list the videos of one playlist with youtube-dl, fetch each video's
# transcript and store it as subs/<title>.srt. Videos whose file already
# exists are skipped, so the script can be re-run to resume.

playlist_id = "PLjQ2gC-5yHEug8_VK8ve0oDSJLoIU4b93"
playlist_url = f"{playlist_id}"
output_dir = "subs"

# Characters that are rejected in file names on at least one common OS; a
# title containing "?" or ":" otherwise fails with "OSError: [Errno 22]".
invalid_chars = str.maketrans({char: "." for char in '\\/:*?"<>|'})

# Writing into a missing folder fails with Errno 2, so create it up front.
os.makedirs(output_dir, exist_ok=True)

# An argument list (no shell) avoids quoting problems in the playlist URL.
# With -j --flat-playlist, youtube-dl prints one JSON object per video.
youtube_dl_cmd = ["youtube-dl", "-j", "--flat-playlist", playlist_url]
json_metas = subprocess.run(
    youtube_dl_cmd, stdout=subprocess.PIPE, universal_newlines=True
).stdout.splitlines()

for json_meta in json_metas:
    if not json_meta.strip():
        continue
    meta = json.loads(json_meta)
    url = meta['url']
    title = meta['title']
    output_path = os.path.join(output_dir, f"{title.translate(invalid_chars)}.srt")

    if os.path.exists(output_path):
        print(f"skipping already downloaded {url}")
        continue

    try:
        sub = YouTubeTranscriptApi.get_transcript(url)
    except TranscriptsDisabled:
        print(f"subtitles are missing for {url}")
        continue

    print(f"subtitles are available for {url}")
    # NOTE(review): the body of this loop and the file write were missing from
    # the source. The reconstruction assumes each transcript entry is a dict
    # with 'text', 'start' and 'duration' (in seconds) -- confirm against the
    # installed youtube_transcript_api version.
    sub_frames = []
    for i, s in enumerate(sub):
        start = datetime.timedelta(seconds=s['start'])
        end = start + datetime.timedelta(seconds=s['duration'])
        sub_frames.append(
            srt.Subtitle(index=i + 1, start=start, end=end, content=s['text'])
        )
    sub_srt = srt.compose(sub_frames)
    # An explicit encoding keeps non-ASCII subtitles from failing on
    # platforms whose default encoding is not UTF-8.
    with open(output_path, "w", encoding="utf-8") as sub_file:
        sub_file.write(sub_srt)

Copy link

kilik128 commented Apr 3, 2023

Traceback (most recent call last):
File "U:\voice\python\subs\", line 36, in
with open(output_path, "w") as sub_file:
OSError: [Errno 22] Invalid argument: 'subs/Bret and Heather 167th DarkHorse Podcast Livestream: AGI: Where Will it End?.srt'

Copy link

To solve Errno 2, create a folder called subs next to the script. To solve Errno 22, replace characters such as ? with ones that are valid in filenames (which characters are valid depends on your OS). It's been a while, but I think that's what .replace('/', '.') is doing for the invalid filename character /. So you can append something like .replace('?', '.') or .replace('?', '_') to that part of the code.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment