Skip to content

Instantly share code, notes, and snippets.

@joeminicucci
Last active February 4, 2021 22:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joeminicucci/fd84dfc0d4d9438a00abfad8f42f8d2f to your computer and use it in GitHub Desktop.
Save joeminicucci/fd84dfc0d4d9438a00abfad8f42f8d2f to your computer and use it in GitHub Desktop.
Tatoeba Corpora Merger
import bz2
import csv
import io
import os
import tarfile
import requests
import argparse
from termcolor import colored
# https://downloads.tatoeba.org/exports/per_language/eng/eng_sentences.tsv.bz2
def get_lang_corpora(langCode):
    """Download one Tatoeba per-language sentence export and index it.

    Fetches ``<langCode>_sentences.tsv.bz2`` from downloads.tatoeba.org,
    decompresses it in memory and parses it as tab-separated text.

    Args:
        langCode: Language code used to build the download URL and file
            name (for example ``eng``).

    Returns:
        A dict mapping the first column of each row to its third column
        (presumably sentence id -> sentence text; confirm against the
        Tatoeba export format). Rows with fewer than three columns are
        skipped.

    Raises:
        Exception: If the download, decompression or parsing fails. The
            underlying error is attached as ``__cause__``.
    """
    lang_filename = langCode + "_sentences.tsv.bz2"
    try:
        print(colored(('Downloading %s corpora..' % (langCode)), 'blue'))
        tatoeba_corpus_download_url = (
            "https://downloads.tatoeba.org:443/exports/per_language/"
            + langCode + "/" + lang_filename
        )
        headers = {"Accept-Encoding": "gzip, deflate"}
        # The timeout bounds each connect/read wait, not the whole download,
        # so large exports still complete but a dead connection cannot hang.
        response = requests.get(
            tatoeba_corpus_download_url, headers=headers, timeout=60)
        # Fail on 4xx/5xx instead of handing an HTML error page to bz2.
        response.raise_for_status()
        print(colored(('Unpacking & De-Scaling %s corpora..' % (langCode)), 'blue'))
        # Text mode with newline='' splits rows on real line endings only;
        # str.splitlines() would also split on Unicode separators that can
        # appear inside a sentence.
        with bz2.open(io.BytesIO(response.content), mode='rt',
                      encoding='utf-8', newline='') as tsv_file:
            # QUOTE_NONE: a sentence that starts with '"' is data, not a
            # quoted CSV field, and must not be unquoted or merged with
            # the following rows.
            rows = csv.reader(tsv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
            return {cols[0]: cols[2] for cols in rows if len(cols) >= 3}
    except Exception as err:
        # 'except Exception' lets KeyboardInterrupt/SystemExit propagate;
        # 'from err' keeps the root cause visible in the traceback.
        raise Exception(colored('Failed to download and de-scale ' + langCode + ' Corpora Dict.', 'red')) from err
def get_corpora_links():
    """Download the Tatoeba links archive and return it as a dict.

    Fetches ``https://downloads.tatoeba.org/exports/links.tar.bz2``, opens
    the ``links.csv`` member and parses it as tab-separated text.

    Returns:
        A dict mapping the first column of each row to its second column
        (presumably sentence id -> translation id; confirm against the
        Tatoeba export format). Rows with fewer than two columns are
        skipped.

        NOTE(review): because the result is a plain dict, an id that
        appears in the first column on several rows keeps only the value
        from its last row. If the export lists several translations per
        sentence, the earlier ones are dropped here.

    Raises:
        Exception: If the download, extraction or parsing fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        print(colored('Downloading Link File..', 'blue'))
        tatoeba_corpora_links_download_url = 'https://downloads.tatoeba.org/exports/links.tar.bz2'
        # The timeout bounds each connect/read wait, not the whole download.
        response = requests.get(tatoeba_corpora_links_download_url, timeout=60)
        # Fail on 4xx/5xx instead of handing an HTML error page to tarfile.
        response.raise_for_status()
        print(colored('Unpacking & De-Scaling link file..', 'blue'))
        with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:bz2") as link_tar:
            member_file = link_tar.extractfile(link_tar.getmember('links.csv'))
            if member_file is None:
                # extractfile() returns None for non-regular members.
                raise ValueError('links.csv is not a regular file in the archive')
            # Stream rows straight from the archive member instead of
            # building one huge string plus a list of every line.
            with io.TextIOWrapper(member_file, encoding='ascii', newline='') as link_csv:
                rows = csv.reader(link_csv, delimiter="\t", quoting=csv.QUOTE_NONE)
                return {cols[0]: cols[1] for cols in rows if len(cols) >= 2}
    except Exception as err:
        # 'except Exception' lets KeyboardInterrupt/SystemExit propagate;
        # 'from err' keeps the root cause visible in the traceback.
        raise Exception(colored('Failed to download and de-scale Corpora Link File.', 'red')) from err
def merge_corporas(corpora1: dict, corpora2: dict, links: dict):
    """Join two corpora through a link table.

    For every ``(key, value)`` pair in ``links`` where ``key`` is present
    in ``corpora1`` and ``value`` is present in ``corpora2``, the result
    maps ``corpora1[key]`` to ``corpora2[value]``. Pairs with a missing
    side are ignored. If several links produce the same ``corpora1``
    entry, the one encountered last wins.

    Args:
        corpora1: First corpus, looked up by the keys of ``links``.
        corpora2: Second corpus, looked up by the values of ``links``.
        links: Mapping from keys of ``corpora1`` to keys of ``corpora2``.

    Returns:
        A dict mapping values of ``corpora1`` to values of ``corpora2``.
    """
    print(colored('Merging Corporas..', 'blue'))
    merged = {}
    for source_id, target_id in links.items():
        # Skip links whose source side is not in the first corpus.
        if source_id not in corpora1:
            continue
        # Skip links whose target side is not in the second corpus.
        if target_id not in corpora2:
            continue
        merged[corpora1[source_id]] = corpora2[target_id]
    return merged
if __name__ == '__main__':
    # Command-line entry point: download the two requested corpora and the
    # link table, merge them, and write the result as a tab-separated
    # '<code1>_<code2>.csv' file.
    parser = argparse.ArgumentParser(
        description='Merge two Tatoeba corpora language dictionaries.')
    parser.add_argument('--lang-code-1', '-l1', type=str, required=True,
                        help='The first language code', dest='langCode1')
    parser.add_argument('--lang-code-2', '-l2', type=str, required=True,
                        help='The second language code', dest='langCode2')
    parser.add_argument('--output-dir', '-o', type=str, required=False,
                        help='The directory to output to', dest='output')
    options = parser.parse_args()

    # Resolve the destination before the slow downloads so a bad
    # --output-dir is reported immediately rather than silently ignored.
    output_dir = os.getcwd()
    if options.output is not None:
        if os.path.isdir(options.output):
            output_dir = options.output
        else:
            print(colored('Output directory %s is not a directory, falling back to %s'
                          % (options.output, output_dir), 'yellow'))
    output = os.path.join(
        output_dir, options.langCode1 + '_' + options.langCode2 + '.csv')

    first_corpus = get_lang_corpora(options.langCode1)
    second_corpus = get_lang_corpora(options.langCode2)
    link_dict = get_corpora_links()
    mergedCorpora_dict = merge_corporas(first_corpus, second_corpus, link_dict)

    print(colored('Outputing to %s' % output, 'blue'))
    with open(output, 'w', encoding='utf-8') as f:
        # One merged pair per line: first-language text, tab, second-language text.
        f.writelines('%s\t%s\n' % (key, value)
                     for key, value in mergedCorpora_dict.items())
    print(colored('Corpora Merge complete.', 'green'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment