Last active
April 3, 2023 12:56
-
-
Save suqingdong/bcf756910321569fb44302bac52edc48 to your computer and use it in GitHub Desktop.
Download subtitle from Youtube Videos
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding=utf-8 -*- | |
""" | |
Download subtitles from YouTube videos. | |
Update: added playlist URL parsing. | |
""" | |
import os | |
import re | |
import sys | |
import json | |
import argparse | |
import bs4 | |
import requests | |
def get_response(url, timeout=30):
    """Fetch ``url`` with an HTTP GET using the shared request settings.

    The module-level ``Proxies`` and ``Headers`` values are passed along
    with every request.

    Args:
        url: The address to request.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole batch.

    Returns:
        The ``requests.Response`` object for the request.
    """
    return requests.get(url, proxies=Proxies, headers=Headers, timeout=timeout)
def get_sub_url(url):
    """Return the subtitle download URL offered for a video page.

    The video URL is submitted to ``BaseURL`` as the ``url`` query
    parameter and the first link matching ``#show b a`` in the returned
    HTML is used.

    Args:
        url: The video page URL to look up.

    Returns:
        ``BaseURL`` concatenated with the ``href`` of the first matching link.

    Raises:
        IndexError: If the result page contains no matching link.
    """
    # Local import so the block works on both Python 2 and Python 3.
    try:
        from urllib.parse import quote
    except ImportError:  # Python 2
        from urllib import quote

    # Percent-encode the video URL; otherwise its own '?' and '&' would be
    # read as extra query parameters of the lookup request.
    full_url = '{}?url={}'.format(BaseURL, quote(url, safe=''))

    response = get_response(full_url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    links = soup.select('#show b a')
    if not links:
        # Same exception type as the original ``[0]`` lookup, but with a
        # message that says what went wrong.
        raise IndexError('No subtitle link found for url: "%s"' % url)

    return BaseURL + links[0].attrs['href']
def get_sub_url_list(url):
    """Expand a playlist page into the watch URLs of its videos.

    The page is expected to embed its data as a JSON object assigned to
    ``window["ytInitialData"]``; the video ids are read from that object.

    Args:
        url: The playlist page URL.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<videoId>`` strings in
        the order the entries appear in the page data.

    Raises:
        ValueError: If the page has no ``window["ytInitialData"]`` assignment.
        KeyError, IndexError: If the embedded JSON does not have the
            expected nesting.
    """
    response = get_response(url)

    match = re.search(r'window\["ytInitialData"\] = ({.*?});', response.text)
    if match is None:
        raise ValueError('No ytInitialData found in playlist page: "%s"' % url)

    data = json.loads(match.group(1))
    tab = data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']
    section = tab['content']['sectionListRenderer']['contents'][0]
    item = section['itemSectionRenderer']['contents'][0]
    playlist = item['playlistVideoListRenderer']['contents']

    # Entries without a 'playlistVideoRenderer' key are not videos; skip
    # them instead of failing the whole playlist with a KeyError.
    return [
        'https://www.youtube.com/watch?v=' + entry['playlistVideoRenderer']['videoId']
        for entry in playlist
        if 'playlistVideoRenderer' in entry
    ]
def get_sub_name(sub_url):
    """Derive the ``.srt`` file name from a subtitle download URL.

    The name is taken from the ``title`` query value that precedes ``&url``
    in ``sub_url``.  The value is fully URL-decoded (``+`` becomes a space,
    every ``%XX`` escape is resolved) and characters that are not allowed
    in file names on common systems are replaced with ``_``.

    Args:
        sub_url: Subtitle URL containing ``title=<name>&url``.

    Returns:
        The decoded, sanitized title with ``.srt`` appended.

    Raises:
        ValueError: If ``sub_url`` has no ``title=...&url`` part.
    """
    # Local import so the block works on both Python 2 and Python 3.
    try:
        from urllib.parse import unquote_plus
    except ImportError:  # Python 2
        from urllib import unquote_plus

    match = re.search(r'title=(.*?)&url', sub_url)
    if match is None:
        raise ValueError('No title found in subtitle url: "%s"' % sub_url)

    title = unquote_plus(match.group(1))
    # A decoded title may contain path separators or characters such as
    # '?' and ':' that make open() fail; neutralize them.
    title = re.sub(r'[\\/:*?"<>|]', '_', title)
    return title + '.srt'
def save_sub(sub_url, outdir):
    """Download one subtitle file and store it in ``outdir``.

    The file name comes from ``get_sub_name(sub_url)``.

    Args:
        sub_url: The subtitle download URL.
        outdir: Existing directory to write the ``.srt`` file into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # io.open accepts an encoding on both Python 2 and Python 3.
    import io

    response = get_response(sub_url)
    # Stop on an HTTP error so an error page is never saved as a subtitle.
    response.raise_for_status()

    outpath = os.path.join(outdir, get_sub_name(sub_url))
    # Write with an explicit encoding; subtitle text is often non-ASCII and
    # the platform default codec may not be able to represent it.
    with io.open(outpath, 'w', encoding='utf-8') as out:
        out.write(response.text)

    print('Succesfully download subtitle: "%s"' % outpath)
def main(infile, outdir):
    """Download subtitles for every URL listed in ``infile``.

    Each non-blank line of ``infile`` is one URL.  A URL containing
    ``playlist`` is first expanded into its video URLs.  Failures are
    reported per URL and do not stop the remaining downloads.

    Args:
        infile: Path of the text file holding one URL per line.
        outdir: Directory for the subtitle files; created if missing.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    with open(infile) as f:
        for line in f:
            url = line.strip()
            if not url:
                # Blank lines are not URLs.
                continue

            video_urls = [url]
            if 'playlist' in url:
                print('This is a playlist url: %s' % url)
                try:
                    video_urls = get_sub_url_list(url)
                except Exception as err:
                    # One unreadable playlist must not abort the batch.
                    print('Could not parse playlist url: "%s" (%s)' % (url, err))
                    continue

            for video_url in video_urls:
                try:
                    sub_url = get_sub_url(video_url)
                    save_sub(sub_url, outdir)
                except Exception as err:
                    # ``Exception`` rather than a bare ``except`` so Ctrl-C
                    # still stops the run; the reason is shown, not hidden.
                    print('Could not find a subtitle for url: "%s" (%s)' % (video_url, err))
# Shared request settings.  They are defined at module level, not inside
# the ``__main__`` guard, so the functions that read them also work when
# this file is imported instead of being run as a script.
BaseURL = 'http://downsub.com/'
Proxies = {}  # filled from --proxies when run as a script
Headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}

if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i', '--infile', help='The URL list file.', required=True)
    parser.add_argument('-o', '--outdir', help='The output dir[default: "%(default)s"].', default='subs')
    parser.add_argument('-p', '--proxies', help='The Proxies[default: %(default)s].', default='http://127.0.0.1:1088')
    args = parser.parse_args()

    # The same proxy address is used for both schemes.
    Proxies = {
        'http': args.proxies,
        'https': args.proxies,
    }

    main(args.infile, args.outdir)
same problem, is it possible to fix it?
You can use this instead.
import os
import datetime
import json
import srt
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
# Download the transcript of every video in a playlist and store each one
# as an .srt file under ``output_dir``.
playlist_id = "PLjQ2gC-5yHEug8_VK8ve0oDSJLoIU4b93"
playlist_url = f"https://youtube.com/playlist?list={playlist_id}"
output_dir = "subs"

# '/' keeps its original replacement ('.') so files written by earlier runs
# are still recognized; other characters that make open() fail with
# "Invalid argument" on some systems (e.g. '?' and ':') become '_'.
filename_fixes = {ch: "_" for ch in '\\:*?"<>|'}
filename_fixes["/"] = "."
filename_table = str.maketrans(filename_fixes)

# open() does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

youtube_dl_cmd = f'youtube-dl -j --flat-playlist "{playlist_url}"'
# youtube-dl prints one JSON document per line, one per playlist entry.
with os.popen(youtube_dl_cmd) as pipe:
    json_metas = pipe.readlines()

for json_meta in json_metas:
    if not json_meta.strip():
        continue
    meta = json.loads(json_meta)
    # NOTE(review): 'url' is handed to get_transcript as-is; presumably the
    # flat-playlist output stores the video id there -- confirm.
    url = meta['url']
    title = meta['title']
    output_path = os.path.join(output_dir, f"{title.translate(filename_table)}.srt")
    if os.path.exists(output_path):
        print(f"skipping already downloaded {url}")
        continue
    try:
        sub = YouTubeTranscriptApi.get_transcript(url)
    except TranscriptsDisabled:
        print(f"subtitles are missing for {url}")
        continue
    print(f"subtitles are available for {url}")
    # 'start' and 'duration' are seconds; timedelta's first positional
    # argument is days, so the unit must be named explicitly.
    sub_frames = [
        srt.Subtitle(
            index=i,
            start=datetime.timedelta(seconds=s['start']),
            end=datetime.timedelta(seconds=s['start'] + s['duration']),
            content=s['text'],
        )
        for i, s in enumerate(sub, start=1)
    ]
    sub_srt = srt.compose(sub_frames)
    with open(output_path, "w", encoding="utf-8") as sub_file:
        sub_file.write(sub_srt)
Traceback (most recent call last):
File "U:\voice\python\subs\downsubis.py", line 36, in
with open(output_path, "w") as sub_file:
OSError: [Errno 22] Invalid argument: 'subs/Bret and Heather 167th DarkHorse Podcast Livestream: AGI: Where Will it End?.srt'
@kilik128
To solve Errno 2, you must create a folder called `subs`. To solve Errno 22, you must replace characters like `?` with ones that are valid in filenames (which depends on your OS). It's been a while, but I think that's what `.replace('/', '.')` is doing for the invalid filename character `/`. So you may append something like `.replace('?', '.')` or `.replace('?', '_')`, etc., to that part of the code.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Script is not working. Can you please check?