# evgenybf/merge_tatoeba_csv.py

## merge_tatoeba_csv.py
import collections
import csv
import itertools


# Input data: tab-separated sentence-pair files downloaded from
# http://www.manythings.org/anki/
# NOTE(review): the number after each archive name is presumably its
# sentence-pair count -- confirm against the download page.

# First input; merge_files() walks its key phrases to drive the output.
# Russian - English rus-eng.zip (304513)
INPUT_FILE_1 = "rus.txt"

# Second input; its translations are looked up by key phrase.
# Swap the active assignment below to merge against another language.
# Spanish - English spa-eng.zip (118964)
# INPUT_FILE_2 = "spa.txt"
# Chinese (Mandarin) - English cmn-eng.zip (20085)
INPUT_FILE_2 = "cmn.txt"

# Destination of the merged rows:
# key phrase, translation from file 1, translation from file 2.
OUTPUT_FILE = "out.txt"


def load_phrases_as_dict(filename):
    """Load a tab-separated phrase file into an ordered mapping.

    Each usable line holds a key phrase in the first column and its
    translation in the second; any further columns are reported and
    ignored. Surrounding whitespace is stripped from both values. Lines
    with fewer than two columns (including blank lines) are reported and
    skipped.

    Args:
        filename: Path of the UTF-8 encoded, tab-separated input file.

    Returns:
        An ``OrderedDict`` mapping each key phrase, in order of first
        appearance, to the list of its translations in file order.
    """
    phrases = collections.OrderedDict()
    with open(filename, "r", encoding="utf-8", newline='') as f:
        # The input is assumed to be plain tab-separated text, not quoted
        # CSV. With the default dialect a sentence starting with '"' opens
        # a quoted field: the quote characters are dropped and following
        # tabs/line breaks may be swallowed into the same field, silently
        # losing records. Disable quote processing to read lines verbatim.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for record in reader:
            if len(record) < 2:
                print("Skipped:", record)
                continue
            if len(record) > 2:
                print("Too many elements: {}: {}".format(len(record), record))
            # Only the first two columns carry data we use.
            key_phrase, translation = (x.strip() for x in record[:2])
            phrases.setdefault(key_phrase, []).append(translation)
    return phrases


def merge_files(file1, file2, output_file):
    """Join two phrase files on their shared key phrases.

    Both inputs are read with ``load_phrases_as_dict``. For every key
    phrase of ``file1`` that also has translations in ``file2``, one
    tab-separated row ``(key phrase, translation 1, translation 2)`` is
    written per combination of translations. Key phrases of ``file1``
    with no translations in ``file2`` are counted and the count is printed.

    Args:
        file1: Path of the first input file; its key order drives the output.
        file2: Path of the second input file.
        output_file: Path of the UTF-8 file to (over)write.
    """
    first = load_phrases_as_dict(file1)
    print("Key phrases in file 1:", len(first))

    second = load_phrases_as_dict(file2)
    print("Key phrases in file 2:", len(second))

    with open(output_file, "w", encoding="utf-8", newline='') as out:
        tsv = csv.writer(out, delimiter='\t')
        missing = 0
        for phrase in first:
            matches = second.get(phrase)
            if matches:
                # product() varies the second sequence fastest:
                # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
                tsv.writerows(
                    (phrase, left, right)
                    for left, right in itertools.product(first[phrase], matches)
                )
            else:
                missing += 1

    print("Phrases without translation:", missing)


def main():
    """Merge the two configured input files into the configured output."""
    merge_files(
        file1=INPUT_FILE_1,
        file2=INPUT_FILE_2,
        output_file=OUTPUT_FILE,
    )


# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	import collections
	import csv
	import itertools


	# Input data: tab-separated sentence-pair files downloaded from
	# http://www.manythings.org/anki/
	# NOTE(review): the number after each archive name is presumably its
	# sentence-pair count -- confirm against the download page.

	# First input; merge_files() walks its key phrases to drive the output.
	# Russian - English rus-eng.zip (304513)
	INPUT_FILE_1 = "rus.txt"

	# Second input; its translations are looked up by key phrase.
	# Swap the active assignment below to merge against another language.
	# Spanish - English spa-eng.zip (118964)
	# INPUT_FILE_2 = "spa.txt"
	# Chinese (Mandarin) - English cmn-eng.zip (20085)
	INPUT_FILE_2 = "cmn.txt"

	# Destination of the merged rows:
	# key phrase, translation from file 1, translation from file 2.
	OUTPUT_FILE = "out.txt"


	def load_phrases_as_dict(filename):
	    """Load a tab-separated phrase file into an ordered mapping.

	    Each usable line holds a key phrase in the first column and its
	    translation in the second; any further columns are reported and
	    ignored. Surrounding whitespace is stripped from both values. Lines
	    with fewer than two columns (including blank lines) are reported and
	    skipped.

	    Args:
	        filename: Path of the UTF-8 encoded, tab-separated input file.

	    Returns:
	        An ``OrderedDict`` mapping each key phrase, in order of first
	        appearance, to the list of its translations in file order.
	    """
	    phrases = collections.OrderedDict()
	    with open(filename, "r", encoding="utf-8", newline='') as f:
	        # The input is assumed to be plain tab-separated text, not quoted
	        # CSV. With the default dialect a sentence starting with '"' opens
	        # a quoted field: the quote characters are dropped and following
	        # tabs/line breaks may be swallowed into the same field, silently
	        # losing records. Disable quote processing to read lines verbatim.
	        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
	        for record in reader:
	            if len(record) < 2:
	                print("Skipped:", record)
	                continue
	            if len(record) > 2:
	                print("Too many elements: {}: {}".format(len(record), record))
	            # Only the first two columns carry data we use.
	            key_phrase, translation = (x.strip() for x in record[:2])
	            phrases.setdefault(key_phrase, []).append(translation)
	    return phrases


	def merge_files(file1, file2, output_file):
	    """Join two phrase files on their shared key phrases.

	    Both inputs are read with ``load_phrases_as_dict``. For every key
	    phrase of ``file1`` that also has translations in ``file2``, one
	    tab-separated row ``(key phrase, translation 1, translation 2)`` is
	    written per combination of translations. Key phrases of ``file1``
	    with no translations in ``file2`` are counted and the count is printed.

	    Args:
	        file1: Path of the first input file; its key order drives the output.
	        file2: Path of the second input file.
	        output_file: Path of the UTF-8 file to (over)write.
	    """
	    phrases1 = load_phrases_as_dict(file1)
	    print("Key phrases in file 1:", len(phrases1))

	    phrases2 = load_phrases_as_dict(file2)
	    print("Key phrases in file 2:", len(phrases2))

	    with open(output_file, "w", encoding="utf-8", newline='') as f:
	        writer = csv.writer(f, delimiter='\t')
	        skipped = 0
	        for key_phrase, translations1 in phrases1.items():
	            translations2 = phrases2.get(key_phrase)
	            if not translations2:
	                skipped += 1
	                continue
	            # From documentation: product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
	            for translation1, translation2 in itertools.product(translations1, translations2):
	                writer.writerow((key_phrase, translation1, translation2))

	    print("Phrases without translation:", skipped)


	def main():
	    """Merge the two configured input files into the configured output."""
	    merge_files(INPUT_FILE_1, INPUT_FILE_2, OUTPUT_FILE)


	# Run the merge only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()