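# Tatoeba Corpora Merger
#
# Downloads two Tatoeba per-language sentence corpora plus the sentence link
# file, and writes the linked sentence pairs to a tab-separated output file.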
import bz2
import csv
import io
import os
import tarfile
import requests
import argparse
from termcolor import colored
def get_lang_corpora(langCode):
    """Download one per-language sentence export and index it by sentence id.

    Args:
        langCode: Language code used to build the download path
            ``<langCode>/<langCode>_sentences.tsv.bz2``.

    Returns:
        dict mapping column 0 of every row (the id, kept as a string) to
        column 2 (the sentence text). Rows with fewer than three columns
        are skipped.

    Raises:
        Exception: if the download fails, the server answers with an error
            status, or the payload cannot be decompressed or decoded.
    """
    # NOTE(review): the base URL was an empty string in the original, which
    # leaves the request URL without a scheme. The value below is the public
    # Tatoeba per-language export location -- confirm before relying on it.
    base_url = "https://downloads.tatoeba.org/exports/per_language/"
    lang_filename = langCode + "_sentences.tsv.bz2"
    tatoeba_corpus_download_url = base_url + langCode + "/" + lang_filename
    headers = {"Accept-Encoding": "gzip, deflate"}

    print(colored('Downloading %s corpora..' % langCode, 'blue'))
    try:
        # The timeout bounds each socket operation, not the whole transfer,
        # so large exports still download while a dead connection fails.
        response = requests.get(tatoeba_corpus_download_url, headers=headers, timeout=60)
        response.raise_for_status()

        print(colored('Unpacking & De-Scaling %s corpora..' % langCode, 'blue'))
        # Text mode with newline='' lets csv do the row splitting itself;
        # str.splitlines() would also break rows on Unicode separators such
        # as U+2028 that can occur inside a sentence.
        with bz2.open(io.BytesIO(response.content), mode='rt',
                      encoding='utf-8', newline='') as tsv_file:
            # QUOTE_NONE: a sentence may legitimately start with a double
            # quote; default quoting would treat it as an opening quote and
            # swallow the following rows into one field.
            rows = csv.reader(tsv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
            return {cols[0]: cols[2] for cols in rows if len(cols) >= 3}
    except (requests.RequestException, OSError, EOFError, ValueError, csv.Error) as err:
        raise Exception(
            colored('Failed to download and de-scale ' + langCode + ' Corpora Dict.', 'red')
        ) from err
def get_corpora_links():
    """Download the sentence link archive and return it as an id -> id dict.

    The archive is opened as a bz2-compressed tar and its ``links.csv``
    member is parsed as tab-separated text.

    Returns:
        dict mapping column 0 of every row to column 1 (both kept as
        strings). Because the result is a dict, only the last row seen for a
        given column-0 value is retained. Rows with fewer than two columns
        are skipped.

    Raises:
        Exception: if the download fails, the server answers with an error
            status, or ``links.csv`` cannot be read from the archive.
    """
    # NOTE(review): the URL was an empty string in the original, which makes
    # the request fail immediately. The value below is the public Tatoeba
    # links export -- confirm before relying on it.
    tatoeba_corpora_links_download_url = 'https://downloads.tatoeba.org/exports/links.tar.bz2'

    print(colored('Downloading Link File..', 'blue'))
    try:
        # Per-socket-operation timeout: guards against a hung connection
        # without capping the total transfer time.
        response = requests.get(tatoeba_corpora_links_download_url, timeout=60)
        response.raise_for_status()

        print(colored('Unpacking & De-Scaling link file..', 'blue'))
        with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:bz2") as link_tar:
            member = link_tar.extractfile(link_tar.getmember('links.csv'))
            if member is None:
                # extractfile() returns None for members that are not
                # regular files or links.
                raise tarfile.ExtractError('links.csv is not a regular file')
            link_csv = member.read().decode('ascii').splitlines()
    except (requests.RequestException, tarfile.TarError, KeyError, OSError,
            EOFError, ValueError) as err:
        raise Exception(
            colored('Failed to download and de-scale Corpora Link File.', 'red')
        ) from err

    rows = csv.reader(link_csv, delimiter="\t")
    return {cols[0]: cols[1] for cols in rows if len(cols) >= 2}
def merge_corporas(corpora1: dict, corpora2: dict, links: dict) -> dict:
    """Pair up the values of two corpora through a key-to-key link table.

    Args:
        corpora1: Mapping of id -> text for the first corpus.
        corpora2: Mapping of id -> text for the second corpus.
        links: Mapping of an id in ``corpora1`` to an id in ``corpora2``.

    Returns:
        dict mapping ``corpora1`` text to ``corpora2`` text for every link
        whose key is present in ``corpora1`` and whose value is present in
        ``corpora2``. Links with a missing side are dropped. If several
        links produce the same ``corpora1`` text, the last one in ``links``
        iteration order wins.
    """
    print(colored('Merging Corporas..', 'blue'))
    # Dict membership is already a hash lookup, so no separate key sets are
    # needed, and the link target is taken straight from the items() pair.
    return {
        corpora1[source_id]: corpora2[target_id]
        for source_id, target_id in links.items()
        if source_id in corpora1 and target_id in corpora2
    }
def main():
    """Command-line entry point: download, merge and write two corpora.

    Parses ``--lang-code-1``, ``--lang-code-2`` and the optional
    ``--output-dir``, downloads both corpora and the link file, merges them,
    and writes one ``<text1>\\t<text2>`` line per pair to
    ``<langCode1>_<langCode2>.csv``. The file goes into the output directory
    when it is given and exists, otherwise into the current working
    directory.
    """
    parser = argparse.ArgumentParser(
        description='Merge two Tatoeba corpora language dictionaries.')
    parser.add_argument('--lang-code-1', '-l1', type=str, required=True,
                        help='The first language code', dest='langCode1')
    parser.add_argument('--lang-code-2', '-l2', type=str, required=True,
                        help='The second language code', dest='langCode2')
    parser.add_argument('--output-dir', '-o', type=str, required=False,
                        help='The directory to output to', dest='output')
    options = parser.parse_args()

    first_corpus = get_lang_corpora(options.langCode1)
    second_corpus = get_lang_corpora(options.langCode2)
    link_dict = get_corpora_links()
    merged_corpora = merge_corporas(first_corpus, second_corpus, link_dict)

    output_name = options.langCode1 + '_' + options.langCode2 + '.csv'
    output_dir = os.getcwd()
    if options.output is not None:
        if os.path.isdir(options.output):
            output_dir = options.output
        else:
            # Keep the original fallback to the working directory, but say
            # so instead of silently ignoring the requested location.
            print(colored('%s is not a directory, falling back to %s'
                          % (options.output, output_dir), 'yellow'))
    output = os.path.join(output_dir, output_name)

    print(colored('Outputing to %s' % output, 'blue'))
    with open(output, 'w', encoding='utf-8') as f:
        f.writelines('%s\t%s\n' % (key, value)
                     for key, value in merged_corpora.items())
    print(colored('Corpora Merge complete.', 'green'))


if __name__ == '__main__':
    main()
