Skip to content

Instantly share code, notes, and snippets.

@evgenybf
Last active May 27, 2018 13:35
Show Gist options
  • Save evgenybf/34d6a9bf037a3ecb010019e18b3c0bed to your computer and use it in GitHub Desktop.
Save evgenybf/34d6a9bf037a3ecb010019e18b3c0bed to your computer and use it in GitHub Desktop.
import collections
import csv
import itertools
# Input data: sentence-pair files downloaded from http://www.manythings.org/anki/
# NOTE(review): the number in parentheses after each archive name is copied
# from the original comments; presumably a sentence-pair count -- confirm.

# First input file: Russian - English, from rus-eng.zip (304513)
INPUT_FILE_1 = "rus.txt"

# Second input file. Alternative choice:
# Spanish - English, from spa-eng.zip (118964)
# INPUT_FILE_2 = "spa.txt"
# Currently selected: Chinese (Mandarin) - English, from cmn-eng.zip (20085)
INPUT_FILE_2 = "cmn.txt"

# Destination file for the merged, tab-separated rows.
OUTPUT_FILE = "out.txt"
def load_phrases_as_dict(filename):
    """Load a tab-separated phrase file into an ordered phrase -> translations map.

    Each line is expected to hold at least two tab-separated columns: the key
    phrase and one translation of it. Any further columns are reported and
    ignored. Lines with fewer than two columns are reported and skipped.
    Surrounding whitespace is stripped from every column.

    Args:
        filename: Path of the UTF-8 encoded, tab-separated input file.

    Returns:
        A ``collections.OrderedDict`` mapping each key phrase (in order of
        first appearance) to the list of its translations (in file order).
    """
    phrases = collections.OrderedDict()
    with open(filename, "r", encoding="utf-8", newline='') as f:
        # The input is plain tab-delimited text, not quoted CSV: a double
        # quote is ordinary sentence punctuation. With the csv module's
        # default quoting, a column that starts with '"' would lose its
        # quotes, and an unbalanced quote would swallow the following tabs
        # and line breaks, silently merging many records into one.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for record in reader:
            if len(record) < 2:
                print("Skipped:", record)
                continue
            if len(record) > 2:
                print("Too many elements: {}: {}".format(len(record), record))
            # Only the first two columns are used; extras are discarded.
            key_phrase, translation, *__ignored = (x.strip() for x in record)
            phrases.setdefault(key_phrase, []).append(translation)
    return phrases
def merge_files(file1, file2, output_file):
    """Join two phrase files on their common key phrase and write the result.

    Both inputs are loaded with ``load_phrases_as_dict``. For every key phrase
    of the first file that also has translations in the second file, one
    tab-separated row ``(key phrase, translation1, translation2)`` is written
    for each combination of a translation from file 1 with a translation from
    file 2. Key phrases with no counterpart in the second file are counted,
    and the count is printed at the end.

    Args:
        file1: Path of the first phrase file; its key order drives the output.
        file2: Path of the second phrase file.
        output_file: Path of the UTF-8 file to (over)write with merged rows.
    """
    first = load_phrases_as_dict(file1)
    print("Key phrases in file 1:", len(first))
    second = load_phrases_as_dict(file2)
    print("Key phrases in file 2:", len(second))

    unmatched = 0
    with open(output_file, "w", encoding="utf-8", newline='') as out:
        tsv = csv.writer(out, delimiter='\t')
        for phrase in first:
            matches = second.get(phrase)
            if matches:
                # Cartesian product: every file-1 translation is paired with
                # every file-2 translation, file-1 order varying slowest.
                tsv.writerows(
                    (phrase, left, right)
                    for left, right in itertools.product(first[phrase], matches)
                )
            else:
                unmatched += 1
        print("Phrases without translation:", unmatched)
def main():
    """Merge the two configured input files into the configured output file."""
    merge_files(
        file1=INPUT_FILE_1,
        file2=INPUT_FILE_2,
        output_file=OUTPUT_FILE,
    )
# Run the merge only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment