Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Tatoeba Corpora Merger
import bz2
import csv
import io
import os
import tarfile
import requests
import argparse
from termcolor import colored
# https://downloads.tatoeba.org/exports/per_language/eng/eng_sentences.tsv.bz2
def get_lang_corpora(langCode):
    """Download one language's Tatoeba sentence export and index it.

    Fetches ``<langCode>_sentences.tsv.bz2`` from downloads.tatoeba.org,
    decompresses it in memory and parses it as tab-separated text.

    Args:
        langCode: Language code used to build the export URL and file name.

    Returns:
        dict mapping column 0 of every row to column 2 of the same row.
        NOTE(review): presumably sentence id -> sentence text; confirm
        against the Tatoeba export format. Rows with fewer than three
        columns are skipped.

    Raises:
        Exception: If the download, decompression or parsing fails. The
            underlying error is chained as ``__cause__``.
    """
    lang_filename = langCode + "_sentences.tsv.bz2"
    try:
        print(colored(('Downloading %s corpora..' % (langCode)), 'blue'))
        tatoeba_corpus_download_url = (
            "https://downloads.tatoeba.org:443/exports/per_language/"
            + langCode + "/" + lang_filename
        )
        headers = {"Accept-Encoding": "gzip, deflate"}
        response = requests.get(
            tatoeba_corpus_download_url, headers=headers, timeout=60)
        # Fail on 4xx/5xx here instead of feeding an error page to bz2.
        response.raise_for_status()
        print(colored(('Unpacking & De-Scaling %s corpora..' % (langCode)), 'blue'))
        # newline='' leaves line splitting to the csv module's expectations
        # and avoids str.splitlines() breaking rows on Unicode separators.
        with bz2.open(io.BytesIO(response.content), mode='rt',
                      encoding='utf-8', newline='') as tsv_file:
            # QUOTE_NONE: sentences may contain literal '"' characters that
            # must not be interpreted as CSV quoting.
            rows = csv.reader(tsv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
            return {cols[0]: cols[2] for cols in rows if len(cols) >= 3}
    except Exception as err:
        # `except Exception` (not bare) lets Ctrl-C / SystemExit propagate.
        raise Exception(colored('Failed to download and de-scale ' + langCode + ' Corpora Dict.', 'red')) from err
def get_corpora_links():
    """Download the Tatoeba links export and return it as a dict.

    Fetches https://downloads.tatoeba.org/exports/links.tar.bz2, reads the
    ``links.csv`` member of the archive and parses it as tab-separated text.

    Returns:
        dict mapping column 0 of every row to column 1 of the same row.
        A key that occurs in several rows keeps only the value of its last
        row, because later rows overwrite earlier ones. Rows with fewer
        than two columns are skipped.

    Raises:
        Exception: If the download, extraction or parsing fails. The
            underlying error is chained as ``__cause__``.
    """
    try:
        print(colored('Downloading Link File..', 'blue'))
        tatoeba_corpora_links_download_url = 'https://downloads.tatoeba.org/exports/links.tar.bz2'
        response = requests.get(tatoeba_corpora_links_download_url, timeout=60)
        # Fail on 4xx/5xx here instead of feeding an error page to tarfile.
        response.raise_for_status()
        print(colored('Unpacking & De-Scaling link file..', 'blue'))
        with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:bz2") as link_tar:
            link_member = link_tar.getmember('links.csv')
            link_csv = link_tar.extractfile(link_member).read().decode('ascii').splitlines()
        return {
            cols[0]: cols[1]
            for cols in csv.reader(link_csv, delimiter="\t")
            if len(cols) >= 2
        }
    except Exception as err:
        # `except Exception` (not bare) lets Ctrl-C / SystemExit propagate.
        raise Exception(colored('Failed to download and de-scale Corpora Link File.', 'red')) from err
def merge_corporas(corpora1: dict, corpora2: dict, links: dict):
    """Pair up the values of two corpora through a link table.

    For every ``source_id -> target_id`` item in ``links`` whose
    ``source_id`` is a key of ``corpora1`` and whose ``target_id`` is a key
    of ``corpora2``, the result maps ``corpora1[source_id]`` to
    ``corpora2[target_id]``.

    Args:
        corpora1: Lookup for the first corpus, keyed by the ids used as
            keys of ``links``.
        corpora2: Lookup for the second corpus, keyed by the ids used as
            values of ``links``.
        links: Mapping from ids in ``corpora1`` to ids in ``corpora2``.

    Returns:
        dict from ``corpora1`` values to ``corpora2`` values. If several
        links yield the same ``corpora1`` value, the last one in ``links``
        iteration order wins.
    """
    print(colored('Merging Corporas..', 'blue'))
    # Dict membership is already a hash lookup, so the corpora are tested
    # directly instead of first copying all their keys into sets.
    return {
        corpora1[source_id]: corpora2[target_id]
        for source_id, target_id in links.items()
        if source_id in corpora1 and target_id in corpora2
    }
if __name__ == '__main__':
    # Command-line entry point: download two language corpora and the link
    # table, merge them, and write one tab-separated pair per line to
    # "<lang1>_<lang2>.csv".
    parser = argparse.ArgumentParser(
        description='Merge two Tatoeba corpora language dictionaries.')
    parser.add_argument('--lang-code-1', '-l1', type=str, required=True,
                        help='The first language code', dest='langCode1')
    parser.add_argument('--lang-code-2', '-l2', type=str, required=True,
                        help='The second language code', dest='langCode2')
    parser.add_argument('--output-dir', '-o', type=str, required=False,
                        help='The directory to output to', dest='output')
    options = parser.parse_args()

    first_corpus = get_lang_corpora(options.langCode1)
    second_corpus = get_lang_corpora(options.langCode2)
    link_dict = get_corpora_links()
    mergedCorpora_dict = merge_corporas(first_corpus, second_corpus, link_dict)

    # Default to the current working directory; use --output-dir only when
    # it names an existing directory.
    output_name = options.langCode1 + '_' + options.langCode2 + '.csv'
    output_dir = os.getcwd()
    if options.output is not None:
        if os.path.isdir(options.output):
            output_dir = options.output
        else:
            # Same fallback as before, but no longer silent.
            print(colored('Output directory %s is not an existing directory; using %s instead.'
                          % (options.output, output_dir), 'yellow'))
    # os.path.join avoids doubled separators (e.g. "out//eng_cmn.csv").
    output = os.path.join(output_dir, output_name)

    print(colored('Outputing to %s' % output, 'blue'))
    with open(output, 'w', encoding='utf-8') as f:
        f.writelines('%s\t%s\n' % (key, value)
                     for key, value in mergedCorpora_dict.items())
    print(colored('Corpora Merge complete.', 'green'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment