Skip to content

Instantly share code, notes, and snippets.

@suqingdong
Last active April 3, 2023 12:56
Show Gist options
  • Save suqingdong/bcf756910321569fb44302bac52edc48 to your computer and use it in GitHub Desktop.
Save suqingdong/bcf756910321569fb44302bac52edc48 to your computer and use it in GitHub Desktop.
Download subtitle from Youtube Videos
#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""
Download subtitles from YouTube videos.
Update: added playlist URL parsing.
"""
import os
import re
import sys
import json
import argparse
import bs4
import requests
def get_response(url, timeout=30):
    """Send an HTTP GET using the module-level ``Proxies`` and ``Headers``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole batch.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked here.
    """
    return requests.get(url, proxies=Proxies, headers=Headers, timeout=timeout)
def get_sub_url(url):
    """Look up the subtitle download link offered for a video page.

    Args:
        url: Address of the video page to look up.

    Returns:
        ``BaseURL`` concatenated with the ``href`` of the first element
        matching the ``#show b a`` selector on the result page.

    Raises:
        IndexError: If the result page contains no such link.
    """
    # Percent-encode the video URL so that its own "?", "&" and "=" travel as
    # part of the ``url`` value instead of being parsed as extra parameters.
    full_url = '{}?url={}'.format(BaseURL, requests.utils.quote(url, safe=''))
    response = get_response(full_url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    links = soup.select('#show b a')
    if not links:
        raise IndexError('no subtitle link found for url: "%s"' % url)
    return BaseURL + links[0].attrs['href']
def get_sub_url_list(url):
    """Expand a playlist page into the watch URLs of its videos.

    Despite the name, the returned items are video page URLs, not subtitle
    URLs; each one still has to go through ``get_sub_url``.

    Args:
        url: Address of the playlist page.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<videoId>`` strings, one
        per video entry found in the page's embedded ``ytInitialData`` JSON.

    Raises:
        IndexError: If the page contains no ``window["ytInitialData"]``
            assignment.
        KeyError: If the embedded JSON does not have the expected layout.
    """
    response = get_response(url)
    result = re.findall(r'window\["ytInitialData"\] = ({.*?});', response.text)
    if not result:
        raise IndexError('no ytInitialData found in playlist page: "%s"' % url)
    data = json.loads(result[0])

    # Walk down to the list of playlist entries one level at a time.
    tab = data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]
    section = tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]
    item = section['itemSectionRenderer']['contents'][0]
    playlist = item['playlistVideoListRenderer']['contents']

    # Skip entries without a playlistVideoRenderer instead of letting a single
    # unexpected entry abort the whole playlist with a KeyError.
    return [
        'https://www.youtube.com/watch?v=' + each['playlistVideoRenderer']['videoId']
        for each in playlist
        if 'playlistVideoRenderer' in each
    ]
def get_sub_name(sub_url):
    """Derive the ``.srt`` file name from a subtitle download URL.

    The name is taken from the ``title`` query parameter, i.e. the text
    between ``title=`` and the following ``&url``.

    Args:
        sub_url: Subtitle download URL containing ``title=...&url``.

    Returns:
        The decoded title with ``.srt`` appended. Characters that cannot be
        used in a file name are replaced by ``_``.

    Raises:
        IndexError: If ``sub_url`` has no ``title=...&url`` section.
    """
    try:  # Python 3
        from urllib.parse import unquote_plus
    except ImportError:  # Python 2
        from urllib import unquote_plus

    match = re.search(r'title=(.*?)&url', sub_url)
    if match is None:
        raise IndexError('no "title" parameter in subtitle url: "%s"' % sub_url)

    # Decode every percent-escape (and "+" as space), not only "%23".
    title = unquote_plus(match.group(1))
    # A decoded title may now hold path separators or characters Windows
    # rejects in file names; neutralise them so the result is a plain name.
    title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)
    return title + '.srt'
def save_sub(sub_url, outdir):
    """Download one subtitle file and write it into ``outdir``.

    Args:
        sub_url: Subtitle download URL; also the source of the file name
            (see ``get_sub_name``).
        outdir: Existing directory that receives the ``.srt`` file.

    Raises:
        requests.HTTPError: If the download answers with an HTTP error status.
    """
    import io  # local import: io.open accepts ``encoding`` on Python 2 and 3

    response = get_response(sub_url)
    # Fail here rather than saving an HTTP error page as if it were a subtitle.
    response.raise_for_status()

    outpath = os.path.join(outdir, get_sub_name(sub_url))
    # response.text is unicode text; an explicit encoding keeps non-ASCII
    # subtitles from depending on the platform's default codec.
    with io.open(outpath, 'w', encoding='utf-8') as out:
        out.write(response.text)
    print('Succesfully download subtitle: "%s"' % outpath)
def main(infile, outdir):
    """Download subtitles for every URL listed in ``infile``.

    Each non-blank line of ``infile`` is treated as one URL. A URL containing
    the text ``playlist`` is first expanded into its video URLs; every video
    URL is then resolved to a subtitle link and saved into ``outdir``.
    Failures are reported on stdout and do not stop the remaining downloads.

    Args:
        infile: Path of a text file with one URL per line.
        outdir: Directory for the subtitle files; created when missing.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    with open(infile) as f:
        for line in f:
            url = line.strip()
            if not url:
                # Blank lines would otherwise be sent off as empty URLs.
                continue

            video_urls = [url]
            if 'playlist' in url:
                print('This is a playlist url: %s' % url)
                try:
                    video_urls = get_sub_url_list(url)
                except Exception:
                    # One unreadable playlist must not abort the other lines.
                    print('Could not parse playlist url: "%s"' % url)
                    continue

            for video_url in video_urls:
                try:
                    sub_url = get_sub_url(video_url)
                    save_sub(sub_url, outdir)
                # ``Exception`` rather than a bare ``except`` so that Ctrl-C
                # (KeyboardInterrupt) still stops the run.
                except Exception:
                    print('Could not find a subtitle for url: "%s"' % video_url)
# Settings read by the functions of this file (get_response uses Proxies and
# Headers, get_sub_url uses BaseURL). They live at module level, outside the
# ``__main__`` guard, so those functions also work when the file is imported
# instead of being run as a script.
BaseURL = 'http://downsub.com/'
Headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
# No proxy until the command line supplies one.
Proxies = None


if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i', '--infile', help='The URL list file.', required=True)
    parser.add_argument('-o', '--outdir', help='The output dir[default: "%(default)s"].', default='subs')
    parser.add_argument('-p', '--proxies', help='The Proxies[default: %(default)s].', default='http://127.0.0.1:1088')
    args = parser.parse_args()

    # The same proxy address is used for both plain and TLS requests.
    Proxies = {
        'http': args.proxies,
        'https': args.proxies
    }
    main(args.infile, args.outdir)
@raman-r-4978
Copy link

Script is not working. Can you please check?

@D-LUFFY89
Copy link

same problem, is it possible to fix it?

@JoelSjogren
Copy link

You can use this instead.

import os
import datetime
import json
import srt
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

# Download the transcript of every video in a playlist and store each one as
# an .srt file under ``subs/``. Videos whose file already exists are skipped.
playlist_id = "PLjQ2gC-5yHEug8_VK8ve0oDSJLoIU4b93"
playlist_url = f"https://youtube.com/playlist?list={playlist_id}"
youtube_dl_cmd = f'youtube-dl -j --flat-playlist "{playlist_url}"'
# Each output line of the command is parsed below as one JSON document.
with os.popen(youtube_dl_cmd) as pipe:
    json_metas = pipe.readlines()

output_dir = "subs"
# open() does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Path separators and the characters Windows rejects in file names; each is
# replaced by "." (the original code did this for "/" only).
invalid_chars = str.maketrans({char: "." for char in '\\/:*?"<>|'})

for json_meta in json_metas:
    meta = json.loads(json_meta)
    # NOTE(review): get_transcript() is given meta['url'] as the video
    # identifier -- confirm your youtube-dl version reports the bare id here.
    url = meta['url']
    title = meta['title']
    output_path = f"{output_dir}/{title.translate(invalid_chars)}.srt"

    if os.path.exists(output_path):
        print(f"skipping already downloaded {url}")
        continue

    try:
        sub = YouTubeTranscriptApi.get_transcript(url)
    except TranscriptsDisabled:
        print(f"subtitles are missing for {url}")
        continue
    print(f"subtitles are available for {url}")

    # 'start' and 'duration' are passed as seconds; timedelta's first
    # positional argument is *days*, so the keyword is required.
    sub_frames = [
        srt.Subtitle(
            index=index,
            start=datetime.timedelta(seconds=s['start']),
            end=datetime.timedelta(seconds=s['start'] + s['duration']),
            content=s['text'],
        )
        for index, s in enumerate(sub, start=1)
    ]
    sub_srt = srt.compose(sub_frames)

    # Explicit encoding so non-ASCII subtitle text does not depend on the
    # platform's default codec.
    with open(output_path, "w", encoding="utf-8") as sub_file:
        sub_file.write(sub_srt)

@kilik128
Copy link

kilik128 commented Apr 3, 2023

Traceback (most recent call last):
File "U:\voice\python\subs\downsubis.py", line 36, in
with open(output_path, "w") as sub_file:
OSError: [Errno 22] Invalid argument: 'subs/Bret and Heather 167th DarkHorse Podcast Livestream: AGI: Where Will it End?.srt'

@JoelSjogren
Copy link

@kilik128
To solve Errno 2 you must create a folder called subs. To solve Errno 22 you must replace characters like ? by ones that are valid in filenames (depends on your OS). It's been a while but I think that's what .replace('/', '.') is doing in the case of the invalid filename character /. So you may append to this part of the code something like .replace('?', '.') or .replace('?', '_') etc.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment