Skip to content

Instantly share code, notes, and snippets.

@cwverhey
Last active February 1, 2022 14:52
Show Gist options
  • Save cwverhey/2cfeb5ce8603a626bd62de321a382214 to your computer and use it in GitHub Desktop.
Save cwverhey/2cfeb5ce8603a626bd62de321a382214 to your computer and use it in GitHub Desktop.
OpenSubtitles.org subtitle search and downloading, with automatic unzipping
#!/usr/bin/env python3
import os
import re
from requests import Session
from argparse import ArgumentParser
from zipfile import ZipFile
from tempfile import TemporaryDirectory
try:
from cv2 import VideoCapture, CAP_PROP_FPS
except ModuleNotFoundError:
print('\033[1;97;101mWarning\033[0m Unable to import cv2. Will not be able to determine FPS for --file.')
class OSDownloader:
    '''
    Scrapes opensubtitles.org for subtitles and downloads/extracts the archives.

    Every search method returns a list with one entry per movie; each entry is
    a list of subtitle IDs (strings) in the order they appear on the page.
    Progress is reported with print().
    '''

    BASE_URL = 'https://www.opensubtitles.org'

    # FPS values that may be sent as 'MovieFPS', checked in this order
    FPS_OPTIONS = ('23.976', '23.980', '24.000', '25.000', '29.970',
                   '30.000', '50.000', '59.940', '60.000')

    # a measured FPS must be closer than this to an option to count as a match
    FPS_TOLERANCE = 0.002

    def __init__(self, lang, max_movies, max_subs_per_movie):
        '''
        lang (str) language, eg 'eng' or 'eng,dut,ger'
        max_movies (int) maximum number of movies to fetch from search results
        max_subs_per_movie (int) max number of subtitles to fetch per movie

        Both maxima are passed through int(), so numeric strings (as delivered
        by a command line parser without type conversion) work as well.
        '''
        # setup session
        self.session = Session()
        self.session.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
                                     "DNT": "1",
                                     "Referer": "https://www.opensubtitles.org/en"
                                     })

        # create tempdir (downloaded zip archives are stored here)
        self.tempdir = TemporaryDirectory(prefix='opensubtitles.org-')

        # store settings
        self.lang = lang.split(',')
        # Without int(), a string limit never equals a loop index and cannot
        # be compared with len(), so the limits were ignored or raised TypeError.
        self.max_movies = int(max_movies)
        self.max_subs_per_movie = int(max_subs_per_movie)

    def _search_params(self, **overrides):
        '''Returns the full search2 query parameter dict; keyword arguments replace the (empty) defaults'''
        # NOTE(review): SubLanguageID is passed as a list here, while
        # get_os_mid() joins the languages with ','. Confirm which form the
        # site expects for multi-language searches.
        params = {'MovieName': '', 'id': 8, 'action': 'search',
                  'SubLanguageID': self.lang, 'Season': '', 'Episode': '',
                  'SubSumCD': '', 'Genre': '', 'MovieByteSize': '',
                  'MovieLanguage': '', 'MovieImdbRatingSign': '',
                  'MovieImdbRating': '', 'MovieCountry': '',
                  'MovieYearSign': 1, 'MovieYear': '', 'MovieFPS': '',
                  'SubFormat': '', 'SubAddDate': '', 'Uploader': '',
                  'IDUser': '', 'Translator': '', 'IMDBID': '',
                  'MovieHash': '', 'IDMovie': ''
                  }
        params.update(overrides)
        return params

    def search_file(self, path):
        '''Get subtitle IDs by filename and FPS search. Input: file path. Output: list of list of subtitle IDs per movie'''
        params = self._search_params(MovieName=self.get_name(path),
                                     MovieFPS=self.get_fps(path))
        return self.search(params)

    def get_name(self, path):
        '''Returns filename of movie without extension'''
        return os.path.splitext(os.path.basename(path))[0]

    def get_fps(self, path):
        '''Returns movie FPS as string, if it is in the OpenSubtitles list of FPS options, otherwise an empty string'''
        try:
            movie = VideoCapture(path)
        except NameError:
            # cv2 could not be imported, so the FPS is unknown
            return ''
        try:
            fps = movie.get(CAP_PROP_FPS)
        finally:
            # free the capture handle instead of waiting for garbage collection
            movie.release()
        return self._match_fps(fps)

    @classmethod
    def _match_fps(cls, fps):
        '''Returns the first entry of FPS_OPTIONS within FPS_TOLERANCE of fps (float), or an empty string'''
        for option in cls.FPS_OPTIONS:
            if abs(fps - float(option)) < cls.FPS_TOLERANCE:
                return option
        return ''

    @staticmethod
    def _split_title_year(title):
        '''
        Splits a search string into (title, year).

        The last standalone 4-digit number is taken as the year, so a year
        appended to a numeric title ('Blade Runner 2049 2017') is picked
        correctly. Only that one token is removed from the title. When nothing
        but the number would remain ('1917'), the input is kept as the title
        and the year is left empty.
        '''
        matches = list(re.finditer(r'\b\d{4}\b', title))
        if not matches:
            return title, ''
        last = matches[-1]
        remainder = ' '.join((title[:last.start()] + ' ' + title[last.end():]).split())
        if not remainder:
            return title, ''
        return remainder, last.group()

    def search_title(self, title):
        '''Get subtitle IDs by movie title and year search. Input: string. A 4-digit year is taken from the string automatically. Output: list of list of subtitle IDs'''
        title, year = self._split_title_year(title)
        return self.search(self._search_params(MovieName=title, MovieYear=year))

    def search_imdb_id(self, imdb_id):
        '''Get subtitle IDs by IMDB movie ID. Output: list of list of subtitle IDs'''
        return self.search(self._search_params(IMDBID=imdb_id))

    def get_os_mid(self, os_mid):
        '''Get subtitle IDs by OpenSubtitles movie ID. Output: list of list of subtitle IDs'''
        lang = ','.join(self.lang)
        url = f'{self.BASE_URL}/en/search/sublanguageid-{lang}/idmovie-{os_mid}'
        return [self.get_os_movieurl(url)]

    def search(self, params):
        '''Performs search for subtitles on opensubtitles.org, input: search parameters, output: list of list of subtitle IDs per movie'''
        r = self.session.get(url=f'{self.BASE_URL}/en/search2', params=params)

        if 'http://schema.org/Movie' in r.text:
            # result is a single movie page
            return [self.list_subtitles(r.text)]

        # result is a list of movies
        movieurls = re.findall(r'<a class="bnone" title=".*?" href="(.*?)">(.*?)</a>', r.text, re.DOTALL)
        print('Found {} movies\n'.format(len(movieurls)))

        subtitles = []
        # only the first max_movies results are fetched (none for a limit <= 0)
        for url, txt in movieurls[:max(self.max_movies, 0)]:
            print("{}\n{}{}".format(txt.replace("\n", " "), self.BASE_URL, url))
            subtitles.append(self.get_os_movieurl(self.BASE_URL + url))
        return subtitles

    def get_os_movieurl(self, movieurl):
        '''Lists subtitles on a single movie results page, input: URL'''
        r = self.session.get(url=movieurl)
        return self.list_subtitles(r.text)

    def list_subtitles(self, content):
        '''Lists subtitles on a single movie results page, input: content string. Prints one line per subtitle, returns list of subtitle IDs'''
        subtitles = re.findall(r'id="main.*?<br />(.*?)<br />.*?<a href="/en/subtitleserve/sub/(\d+)".*?</tr>', content, re.DOTALL)
        print('{} subtitles'.format(len(subtitles)))
        for description, sub_id in subtitles:
            # Replace each <span title="..."> by its title attribute and keep
            # the surrounding text. The previous pattern ended in a lazy group
            # that always matched empty, dropping everything after </span>.
            description = re.sub(r'<span title="(.*?)">.*?</span>',
                                 lambda m: m.group(1), description, flags=re.DOTALL)
            description = description.replace("\n", " ").strip()
            print(f"- {sub_id}: {description}")
        print()
        return [sub_id for _, sub_id in subtitles]

    def download_subtitles(self, sub_ids, targetdir, try_filename = False):
        '''Downloads subtitles with download_subtitle() from list of movies, one list of sub_ids per movie'''
        limit = max(self.max_subs_per_movie, 0)
        for movie_subs in sub_ids:
            for sub_id in movie_subs[:limit]:
                self.download_subtitle(sub_id=sub_id, targetdir=targetdir, try_filename=try_filename)

    def download_subtitle(self, sub_id, targetdir, try_filename = False):
        '''
        Downloads subtitles zip and extracts, input: subtitle ID

        targetdir (str) directory to extract into, created if missing
        try_filename (str or False) preferred basename (without extension) for extracted files

        Raises RuntimeError when the download does not return HTTP status 200.
        '''
        print(f"download_subtitle({sub_id})")

        # zip file download
        r = self.session.get(url=f"{self.BASE_URL}/en/subtitleserve/sub/{sub_id}")
        if r.status_code != 200:
            raise RuntimeError(f"download_subtitle({sub_id}) status code: {r.status_code}")

        # save zip to temp file; the request is not streamed, so the body is
        # written in one call instead of iterating it in 1-byte chunks
        zip_path = os.path.join(self.tempdir.name, f"download_{sub_id}.zip")
        with open(zip_path, "wb") as zipfh:
            zipfh.write(r.content)
        print(f"saved compressed archive to {zip_path}")

        self._extract_archive(zip_path, targetdir, try_filename)
        print()

    def _extract_archive(self, zip_path, targetdir, try_filename = False):
        '''
        Extracts a downloaded zip archive into targetdir.

        .nfo files and files that already exist are skipped. If try_filename
        is given, each extracted file is renamed to try_filename + its own
        extension, unless that name is already taken.

        The working directory of the process is not changed, so a relative
        targetdir resolves to the same place on every call.
        '''
        targetdir = os.path.abspath(targetdir)
        if not os.path.exists(targetdir):
            print(f"creating {targetdir}")
            os.makedirs(targetdir, exist_ok=True)

        # extract zip
        print('extracting to {}:'.format(targetdir))
        with ZipFile(zip_path, 'r') as zipObj:
            for filename in zipObj.namelist():
                if filename.endswith('/'):
                    # directory entry; directories are created when their files are extracted
                    continue
                if filename.lower().endswith('.nfo'):
                    print(f'[SKIP] {filename}')
                    continue
                if os.path.exists(os.path.join(targetdir, filename)):
                    print(f'[SKIP] {filename} (exists)')
                    continue

                print(f'\033[1;32;40m[ OK ]\033[0m {filename}')
                # extract() returns the path that was actually written
                extracted_path = zipObj.extract(filename, path=targetdir)

                if try_filename is not False:
                    target_filename = try_filename + os.path.splitext(filename)[1]
                    target_path = os.path.join(targetdir, target_filename)
                    if not os.path.exists(target_path):
                        print(f' -> {target_filename}')
                        os.rename(extracted_path, target_path)
# def printreq(r,s):
# print('--------------------------------------------------------------\n')
# print(r.url)
# print('\n==REQUEST===========================')
# print(r.request.headers)
# print('==RESPONSE==========================')
# print(r.status_code)
# print(r.headers)
# print(r.text[:100])
# print('==COOKIES===========================')
# print(s.cookies.get_dict())
# print('--------------------------------------------------------------')
if __name__ == "__main__":
# parse arguments
parser = ArgumentParser(description='Download subtitles from OpenSubtitles.org, search either for a file (extracts filename and FPS), the movie\'s title, the movie\'s IMDB ID or its OpenSubtitles ID. One of --file, --title, --imdb, --osm or --oss must be supplied.')
parser.add_argument('--file', '-f', help='movie file', metavar='FILE')
parser.add_argument('--title', '-t', help='(partial) movie title, you can add the year as 4 digits eg. \'Matrix 2021\'', metavar='TIT')
parser.add_argument('--imdb', '-i', help='IMDB ID (eg. \'10838180\' in https://www.imdb.com/title/tt10838180/)', metavar='ID')
parser.add_argument('--osm', '-m', help='OpenSubtitles movie ID (eg. \'915058\' in https://www.opensubtitles.org/en/search/sublanguageid-eng/idmovie-915058)', metavar='ID')
parser.add_argument('--oss', '-s', help='OpenSubtitles subtitle ID (eg. \'8917474\' in https://www.opensubtitles.org/en/subtitleserve/sub/8917474)', metavar='ID')
parser.add_argument('--lang', '-l', default='eng', help='subtitle language(s) to search/download (eg: \'eng,ger,fra\', default: \'eng\')', metavar='LANG')
parser.add_argument('--dir', '-d', help='folder to download subs into (relative to default), defaults to movie dir for --file, or current dir for other options', metavar='PATH')
parser.add_argument('--exactname', '-e', action='store_true', help='name the first downloaded subtitle identical to the movie (only works with --file)')
parser.add_argument('--movies', default=4, help='max. number of movie results to download subs for (default: 4)', metavar='NUM')
parser.add_argument('--subs', default=4, help='max. number of subtitles to download per movie (default: 4)', metavar='NUM')
parser.add_argument('-n', help='shorthand for --movies NUM --subs NUM', metavar='NUM')
args, unknown = parser.parse_known_args()
# clean arguments
def clean_bracketed_paste(str_in):
if not isinstance(str_in, str):
return(str_in)
return re.sub(r'^\d\d~|\d\d~$', '', str_in)
args = {k:clean_bracketed_paste(v) for k,v in vars(args).items()}
unknown = [clean_bracketed_paste(v) for v in unknown]
unknown = [v for v in unknown if v != '']
# handle missing arguments
if args['file'] is None and args['title'] is None and args['imdb'] is None and args['osm'] is None and args['oss'] is None:
if len(unknown) > 0:
# unknown argument becomes file argument
args['file'] = unknown[0]
else:
# show help and exit
parser.print_help()
raise SystemExit(1)
# apply `n` argument
if args['n'] is not None:
args['movies'] = args['n']
args['subs'] = args['n']
# figure out absolute path to target directory
if args['dir'] is not None:
if os.path.isabs(args['dir']):
targetdir = args['dir']
else:
targetdir = os.path.abspath(args['dir']) if args['file'] is None else os.path.join(os.path.abspath(os.path.dirname(args['file'])), args['dir'])
else:
targetdir = os.getcwd() if args['file'] is None else os.path.abspath(os.path.dirname(args['file']))
# figure out preferred filename for extracted subs (basename w/o extension)
if args['exactname'] and args['file'] is not None:
try_filename = os.path.splitext(os.path.basename(args['file']))[0]
else:
try_filename = False
# start OS HTTP-session
osd = OSDownloader(lang = args['lang'], max_movies = args['movies'], max_subs_per_movie = args['subs'])
# search subtitles, get list of subtitle ids
if args['file']:
sub_ids = osd.search_file(args['file'])
elif args['title']:
sub_ids = osd.search_title(args['title'])
elif args['imdb']:
sub_ids = osd.search_imdb_id(args['imdb'])
elif args['osm']:
sub_ids = osd.get_os_mid(args['osm'])
elif args['oss']:
sub_ids = [[args['oss']]]
else:
assert("This shouldn't be possible")
# download
osd.download_subtitles(sub_ids, targetdir, try_filename)
# bye!
print("Done!\n\n(c) CW Verhey @ https://gist.github.com/cwverhey/\nGive OpenSubtitles your support: https://www.opensubtitles.org/en/support\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment