Last active
February 1, 2022 14:52
-
-
Save cwverhey/2cfeb5ce8603a626bd62de321a382214 to your computer and use it in GitHub Desktop.
OpenSubtitles.org subtitle search and downloading, with automatic unzipping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import re | |
from requests import Session | |
from argparse import ArgumentParser | |
from zipfile import ZipFile | |
from tempfile import TemporaryDirectory | |
try: | |
from cv2 import VideoCapture, CAP_PROP_FPS | |
except ModuleNotFoundError: | |
print('\033[1;97;101mWarning\033[0m Unable to import cv2. Will not be able to determine FPS for --file.') | |
class OSDownloader:
    '''Search opensubtitles.org for subtitles and download/unzip them.

    All HTTP traffic goes through one requests Session; downloaded zip
    archives are kept in a TemporaryDirectory owned by the instance.
    Progress is reported with print().
    '''

    # FPS labels accepted by get_fps(), tried in this order (first match wins).
    _FPS_OPTIONS = ('23.976', '23.980', '24.000', '25.000', '29.970',
                    '30.000', '50.000', '59.940', '60.000')
    # Maximum absolute difference between measured FPS and a label.
    _FPS_TOLERANCE = 0.002

    def __init__(self, lang, max_movies, max_subs_per_movie):
        '''
        lang (str)                language, eg 'eng' or 'eng,dut,ger'
        max_movies (int)          maximum number of movies to fetch from search results
        max_subs_per_movie (int)  max number of subtitles to fetch per movie

        Both limits are converted with int(), so numeric strings (as delivered
        by a command line) are accepted; anything else raises ValueError.
        '''
        # setup session
        self.session = Session()
        self.session.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
                                     "DNT": "1",
                                     "Referer": "https://www.opensubtitles.org/en"
                                     })
        # create tempdir
        self.tempdir = TemporaryDirectory(prefix='opensubtitles.org-')
        # store settings
        self.lang = lang.split(',')
        # The limits are compared against ints later on; a str here would make
        # `i == max_movies` never true and `min(str, int)` raise TypeError.
        self.max_movies = int(max_movies)
        self.max_subs_per_movie = int(max_subs_per_movie)

    def _build_params(self, **overrides):
        '''Return the search2 query parameters, all empty except for `overrides`.

        Key order is fixed so the generated query string does not depend on
        which search method built it.
        '''
        params = {'MovieName': '', 'id': 8, 'action': 'search',
                  # self.lang is a list; requests sends one key per element.
                  'SubLanguageID': self.lang, 'Season': '', 'Episode': '',
                  'SubSumCD': '', 'Genre': '', 'MovieByteSize': '',
                  'MovieLanguage': '', 'MovieImdbRatingSign': '',
                  'MovieImdbRating': '', 'MovieCountry': '',
                  'MovieYearSign': 1, 'MovieYear': '', 'MovieFPS': '',
                  'SubFormat': '', 'SubAddDate': '', 'Uploader': '',
                  'IDUser': '', 'Translator': '', 'IMDBID': '',
                  'MovieHash': '', 'IDMovie': ''
                  }
        params.update(overrides)
        return params

    def search_file(self, path):
        '''Get subtitle IDs by filename and FPS search. Input: file path. Output: list of list of subtitle IDs per movie'''
        return self.search(self._build_params(MovieName=self.get_name(path),
                                              MovieFPS=self.get_fps(path)))

    def get_name(self, path):
        '''Returns filename of movie without extension'''
        return os.path.splitext(os.path.basename(path))[0]

    @classmethod
    def _match_fps(cls, fps):
        '''Return the first FPS label within tolerance of `fps`, or '' if none.'''
        for label in cls._FPS_OPTIONS:
            if abs(fps - float(label)) < cls._FPS_TOLERANCE:
                return label
        return ''

    def get_fps(self, path):
        '''Returns movie FPS as string, if it is in the OpenSubtitles list of FPS options.

        Returns '' when cv2 could not be imported (VideoCapture is then
        undefined) or when the measured FPS matches none of the options.
        '''
        try:
            movie = VideoCapture(path)
        except NameError:
            # cv2 import failed at module load; FPS detection is unavailable
            return ''
        try:
            fps = movie.get(CAP_PROP_FPS)
        finally:
            # free the capture handle instead of waiting for garbage collection
            movie.release()
        return self._match_fps(fps)

    @staticmethod
    def _split_title_year(title):
        '''Split a search string into (title, year).

        The first standalone 4-digit number is taken as the year and removed
        from the title; year is '' when there is none. The title is stripped
        of surrounding whitespace left behind by the removal.
        '''
        found = re.search(r'\b(\d{4})\b', title)
        if not found:
            return title.strip(), ''
        year = found.group()
        return title.replace(year, '').strip(), year

    def search_title(self, title):
        '''Get subtitle IDs by movie title and year search. Input: string. A 4-digit year is taken from the string automatically. Output: list of list of subtitle IDs'''
        title, year = self._split_title_year(title)
        return self.search(self._build_params(MovieName=title, MovieYear=year))

    def search_imdb_id(self, imdb_id):
        '''Get subtitle IDs by IMDB movie ID. Output: list of list of subtitle IDs'''
        return self.search(self._build_params(IMDBID=imdb_id))

    def get_os_mid(self, os_mid):
        '''Get subtitle IDs by OpenSubtitles movie ID. Output: list of list of subtitle IDs'''
        lang = ','.join(self.lang)
        url = f'https://www.opensubtitles.org/en/search/sublanguageid-{lang}/idmovie-{os_mid}'
        return [self.get_os_movieurl(url)]

    def search(self, params):
        '''Performs search for subtitles on opensubtitles.org, input: search parameters, output: list of list of subtitle IDs per movie'''
        url = 'https://www.opensubtitles.org/en/search2'
        r = self.session.get(url=url, params=params)
        if re.search('http://schema.org/Movie', r.text):
            # result is a single movie page
            return [self.list_subtitles(r.text)]
        # result is a list of movies
        movieurls = re.findall(r'<a class="bnone" title=".*?" href="(.*?)">(.*?)</a>', r.text, re.DOTALL)
        print('Found {} movies\n'.format(len(movieurls)))
        subtitles = []
        for i, (url, txt) in enumerate(movieurls):
            # stop once max_movies result pages have been fetched
            if i == self.max_movies:
                break
            print("{}\nhttps://www.opensubtitles.org{}".format(txt.replace("\n", " "), url))
            subtitles.append(self.get_os_movieurl("https://www.opensubtitles.org" + url))
        return subtitles

    def get_os_movieurl(self, movieurl):
        '''Lists subtitles on a single movie results page, input: URL'''
        r = self.session.get(url=movieurl)
        return self.list_subtitles(r.text)

    @staticmethod
    def _clean_release_name(text):
        '''Flatten one subtitle description cell to a single printable line.

        If the cell contains <span title="X">...</span>, the span is replaced
        by X while the text before and after the span is kept.
        '''
        # The trailing group is greedy on purpose: a lazy group at the end of
        # the pattern always matches '' and would drop the text after </span>.
        span = re.search(r'(.*?)<span title="(.*?)">.*?</span>(.*)', text, re.DOTALL)
        if span:
            text = span.group(1) + span.group(2) + span.group(3)
        return text.replace("\n", " ").strip()

    def list_subtitles(self, content):
        '''Lists subtitles on a single movie results page, input: content string.

        Prints one line per subtitle found and returns the list of subtitle
        IDs (strings) in page order.
        '''
        subtitles = re.findall(r'id="main.*?<br />(.*?)<br />.*?<a href="/en/subtitleserve/sub/(\d+)".*?</tr>', content, re.DOTALL)
        print('{} subtitles'.format(len(subtitles)))
        for t, n in subtitles:
            print(f"- {n}: {self._clean_release_name(t)}")
        print()
        return [sub_id for _, sub_id in subtitles]

    def download_subtitles(self, sub_ids, targetdir, try_filename = False):
        '''Downloads subtitles with download_subtitle() from list of movies, one list of sub_ids per movie.

        At most max_subs_per_movie subtitles are downloaded per movie.
        '''
        # clamp at 0 so a negative limit means "none", not "all but the last"
        limit = max(int(self.max_subs_per_movie), 0)
        for movie_subs in sub_ids:
            for sub_id in movie_subs[:limit]:
                self.download_subtitle(sub_id=sub_id, targetdir=targetdir, try_filename=try_filename)

    def download_subtitle(self, sub_id, targetdir, try_filename = False):
        '''Downloads subtitles zip and extracts, input: subtitle ID.

        sub_id        subtitle ID used in the subtitleserve URL
        targetdir     directory to extract into (created if missing)
        try_filename  False, or a basename (without extension) that newly
                      extracted files are renamed to if that name is free

        Raises RuntimeError when the server does not answer with status 200.
        ZipFile raises zipfile.BadZipFile if the response is not a zip archive.
        '''
        print(f"download_subtitle({sub_id})")
        # zip file download
        r = self.session.get(url=f"https://www.opensubtitles.org/en/subtitleserve/sub/{sub_id}")
        if r.status_code != 200:
            raise RuntimeError(f"download_subtitle({sub_id}) status code: {r.status_code}")
        # save zip to temp file
        zip_path = os.path.join(self.tempdir.name, f"download_{sub_id}.zip")
        with open(zip_path, "wb") as zipfh:
            # explicit chunk size: the iter_content() default is 1 byte per chunk
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    zipfh.write(chunk)
        print(f"saved compressed archive to {zip_path}")
        self._extract_archive(zip_path, targetdir, try_filename)

    def _extract_archive(self, zip_path, targetdir, try_filename = False):
        '''Extract a downloaded zip into targetdir, skipping .nfo and existing files.

        Works with explicit paths instead of os.chdir(), so the caller's
        working directory is left untouched and a relative targetdir resolves
        to the same place on every call.
        '''
        targetdir = os.path.abspath(targetdir)
        # create target dir
        if not os.path.exists(targetdir):
            print(f"creating {targetdir}")
            os.makedirs(targetdir)
        # extract zip
        print('extracting to {}:'.format(targetdir))
        with ZipFile(zip_path, 'r') as archive:
            for filename in archive.namelist():
                if filename.endswith('.nfo'):
                    print(f'[SKIP] {filename}')
                    continue
                if os.path.exists(os.path.join(targetdir, filename)):
                    print(f'[SKIP] {filename} (exists)')
                    continue
                print(f'\033[1;32;40m[ OK ]\033[0m {filename}')
                # extract() returns the path it actually wrote to
                extracted = archive.extract(filename, path=targetdir)
                if try_filename is not False:
                    target_filename = try_filename + os.path.splitext(filename)[1]
                    print(filename, try_filename, target_filename)
                    target_path = os.path.join(targetdir, target_filename)
                    # only the first file per extension gets the preferred name
                    if not os.path.exists(target_path):
                        print(f' -> {target_filename}')
                        os.rename(extracted, target_path)
        print()
# def printreq(r,s): | |
# print('--------------------------------------------------------------\n') | |
# print(r.url) | |
# print('\n==REQUEST===========================') | |
# print(r.request.headers) | |
# print('==RESPONSE==========================') | |
# print(r.status_code) | |
# print(r.headers) | |
# print(r.text[:100]) | |
# print('==COOKIES===========================') | |
# print(s.cookies.get_dict()) | |
# print('--------------------------------------------------------------') | |
if __name__ == "__main__": | |
# parse arguments | |
parser = ArgumentParser(description='Download subtitles from OpenSubtitles.org, search either for a file (extracts filename and FPS), the movie\'s title, the movie\'s IMDB ID or its OpenSubtitles ID. One of --file, --title, --imdb, --osm or --oss must be supplied.') | |
parser.add_argument('--file', '-f', help='movie file', metavar='FILE') | |
parser.add_argument('--title', '-t', help='(partial) movie title, you can add the year as 4 digits eg. \'Matrix 2021\'', metavar='TIT') | |
parser.add_argument('--imdb', '-i', help='IMDB ID (eg. \'10838180\' in https://www.imdb.com/title/tt10838180/)', metavar='ID') | |
parser.add_argument('--osm', '-m', help='OpenSubtitles movie ID (eg. \'915058\' in https://www.opensubtitles.org/en/search/sublanguageid-eng/idmovie-915058)', metavar='ID') | |
parser.add_argument('--oss', '-s', help='OpenSubtitles subtitle ID (eg. \'8917474\' in https://www.opensubtitles.org/en/subtitleserve/sub/8917474)', metavar='ID') | |
parser.add_argument('--lang', '-l', default='eng', help='subtitle language(s) to search/download (eg: \'eng,ger,fra\', default: \'eng\')', metavar='LANG') | |
parser.add_argument('--dir', '-d', help='folder to download subs into (relative to default), defaults to movie dir for --file, or current dir for other options', metavar='PATH') | |
parser.add_argument('--exactname', '-e', action='store_true', help='name the first downloaded subtitle identical to the movie (only works with --file)') | |
parser.add_argument('--movies', default=4, help='max. number of movie results to download subs for (default: 4)', metavar='NUM') | |
parser.add_argument('--subs', default=4, help='max. number of subtitles to download per movie (default: 4)', metavar='NUM') | |
parser.add_argument('-n', help='shorthand for --movies NUM --subs NUM', metavar='NUM') | |
args, unknown = parser.parse_known_args() | |
# clean arguments | |
def clean_bracketed_paste(str_in): | |
if not isinstance(str_in, str): | |
return(str_in) | |
return re.sub(r'^\d\d~|\d\d~$', '', str_in) | |
args = {k:clean_bracketed_paste(v) for k,v in vars(args).items()} | |
unknown = [clean_bracketed_paste(v) for v in unknown] | |
unknown = [v for v in unknown if v != ''] | |
# handle missing arguments | |
if args['file'] is None and args['title'] is None and args['imdb'] is None and args['osm'] is None and args['oss'] is None: | |
if len(unknown) > 0: | |
# unknown argument becomes file argument | |
args['file'] = unknown[0] | |
else: | |
# show help and exit | |
parser.print_help() | |
raise SystemExit(1) | |
# apply `n` argument | |
if args['n'] is not None: | |
args['movies'] = args['n'] | |
args['subs'] = args['n'] | |
# figure out absolute path to target directory | |
if args['dir'] is not None: | |
if os.path.isabs(args['dir']): | |
targetdir = args['dir'] | |
else: | |
targetdir = os.path.abspath(args['dir']) if args['file'] is None else os.path.join(os.path.abspath(os.path.dirname(args['file'])), args['dir']) | |
else: | |
targetdir = os.getcwd() if args['file'] is None else os.path.abspath(os.path.dirname(args['file'])) | |
# figure out preferred filename for extracted subs (basename w/o extension) | |
if args['exactname'] and args['file'] is not None: | |
try_filename = os.path.splitext(os.path.basename(args['file']))[0] | |
else: | |
try_filename = False | |
# start OS HTTP-session | |
osd = OSDownloader(lang = args['lang'], max_movies = args['movies'], max_subs_per_movie = args['subs']) | |
# search subtitles, get list of subtitle ids | |
if args['file']: | |
sub_ids = osd.search_file(args['file']) | |
elif args['title']: | |
sub_ids = osd.search_title(args['title']) | |
elif args['imdb']: | |
sub_ids = osd.search_imdb_id(args['imdb']) | |
elif args['osm']: | |
sub_ids = osd.get_os_mid(args['osm']) | |
elif args['oss']: | |
sub_ids = [[args['oss']]] | |
else: | |
assert("This shouldn't be possible") | |
# download | |
osd.download_subtitles(sub_ids, targetdir, try_filename) | |
# bye! | |
print("Done!\n\n(c) CW Verhey @ https://gist.github.com/cwverhey/\nGive OpenSubtitles your support: https://www.opensubtitles.org/en/support\n") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment