Skip to content

Instantly share code, notes, and snippets.

@tnq177
Created July 23, 2019 17:48
Show Gist options
  • Save tnq177/13d91310c81d191f3f92c036d3a6a2ac to your computer and use it in GitHub Desktop.
Save tnq177/13d91310c81d191f3f92c036d3a6a2ac to your computer and use it in GitHub Desktop.
Download OpenSubtitles data
import os
import sys
import requests
from homura import download
from multiprocessing import Pool
# Language codes that are combined pairwise with the requested language to
# build candidate archive names of the form "<src>-<trg>.txt.zip".
LANGUAGES = [
    'af', 'ar', 'bg', 'bn', 'br', 'bs', 'ca', 'cs', 'da', 'de',
    'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gl',
    'he', 'hi', 'hr', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'ka',
    'kk', 'ko', 'lt', 'lv', 'mk', 'ml', 'ms', 'nl', 'no', 'pl',
    'pt', 'pt_br', 'ro', 'ru', 'si', 'sk', 'sl', 'sq', 'sr', 'sv',
    'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'ze_en', 'ze_zh',
    'zh_cn', 'zh_tw',
]

# Separator used to pack an archive name and a destination directory into a
# single string, so each unit of work can be passed around as one argument.
DELIMITER = '<bazingaaaaa>'
def download_file(file_and_path):
    """Download one OPUS OpenSubtitles v2018 moses archive if it is not already present.

    Args:
        file_and_path: A single string made of the archive name
            (``"<src>-<trg>.txt.zip"``), followed by ``DELIMITER``, followed
            by the directory the archive should be saved into.

    Returns:
        None. The function returns early without downloading when the
        language pair already exists in the destination directory (in either
        direction, zipped or unzipped) or when the server does not answer the
        HEAD request with status 200.
    """
    # Split only on the first delimiter so the destination path is kept whole.
    file_name, savepath = file_and_path.split(DELIMITER, 1)

    langs = file_name.split('.')[0]
    src_lang, trg_lang = langs.split('-')

    # Check the local directory before touching the network. Both orderings
    # of the pair are checked because the caller queues "a-b" and "b-a".
    # NOTE: use the directory carried in the argument, not a module-level
    # global; worker processes are not guaranteed to see globals that are
    # only assigned under the ``__main__`` guard.
    for first, second in ((src_lang, trg_lang), (trg_lang, src_lang)):
        unzipped = os.path.join(savepath, '{}-{}.txt'.format(first, second))
        zipped = unzipped + '.zip'
        if os.path.exists(zipped) or os.path.exists(unzipped):
            return

    url = "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/{}".format(file_name)

    # Probe with HEAD first: many candidate pair names do not exist remotely.
    r = requests.head(url)
    if r.status_code != 200:
        return

    # download
    path = os.path.join(savepath, file_name)
    download(url=url, path=path)
    print("Finished {}".format(file_name))
if __name__ == '__main__':
    # Expect exactly two arguments: the language code and the output directory.
    if len(sys.argv) != 3:
        raise ValueError("Usage: python3 download_opensub.py lang saveto_path")
    lang = sys.argv[1]
    saveto = sys.argv[2]

    # Queue both orderings of every pair, since the remote archive may be
    # named either "<other>-<lang>" or "<lang>-<other>". Each work item packs
    # the archive name and destination directory into one string.
    file_and_paths = []
    for other_lang in LANGUAGES:
        if other_lang != lang:
            file_and_paths.append("{}-{}.txt.zip{}{}".format(other_lang, lang, DELIMITER, saveto))
            file_and_paths.append("{}-{}.txt.zip{}{}".format(lang, other_lang, DELIMITER, saveto))

    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open until interpreter exit.
    with Pool(10) as p:
        p.map(download_file, file_and_paths)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment