Last active
December 10, 2015 08:58
-
-
Save bradbeattie/4411556 to your computer and use it in GitHub Desktop.
Extracts a TED talk in two languages and creates a CSV of matching translated strings for flashcard use. Some potentially mistranslated entries will be stripped. Download the desired parallel corpus here: https://wit3.fbk.eu/mt.php?release=2012-02
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import unicodecsv | |
import sys | |
import argparse | |
import re | |
# Command-line interface: two positional paths, one file per language.
parser = argparse.ArgumentParser()
parser.add_argument("source", help="Source language file")
parser.add_argument("target", help="Target language file")
args = parser.parse_args()

# Load both files fully into memory as lists of lines; the pairing step below
# matches them up by line position. The `with` blocks make sure each file
# handle is closed as soon as its lines have been read.
with open(args.source) as source_file:
    source = source_file.readlines()
with open(args.target) as target_file:
    target = target_file.readlines()
def clean(sentence):
    """Return *sentence* with whitespace normalized.

    Leading and trailing whitespace (including the newline left by
    ``readlines``) is removed, and every internal run of whitespace is
    collapsed to a single space.
    """
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so joining with one space normalizes and strips in one pass.
    return " ".join(sentence.split())
def fails(sentence):
    """Return a truthy value when *sentence* should be discarded.

    A sentence is rejected when any of these holds, checked in order:

    * it contains an odd number of double-quote characters,
    * it contains no space at all,
    * it contains a colon followed by a space,
    * it does not begin with an ASCII capital letter,
    * its last character is not ``.``, ``!`` or ``?``.

    The result is the first truthy check value, so an odd quote count
    yields the integer ``1`` while the other checks yield ``True``; a
    sentence that passes everything yields ``False``.
    """
    odd_quote_count = sentence.count('"') % 2
    if odd_quote_count:
        return odd_quote_count

    # An empty string is caught here, so the final-character lookup at the
    # end never runs on empty input.
    if " " not in sentence:
        return True

    if ": " in sentence:
        return True

    if not re.search("^[A-Z]", sentence):
        return True

    last_char = sentence[-1]
    return last_char not in (".", "!", "?")
# Rows are written to stdout as UTF-8, pipe-delimited, with "~" as the quote
# character.
writer = unicodecsv.writer(sys.stdout, encoding="utf-8", delimiter="|", quotechar="~")

# The two files are paired line by line. If their lengths differ, say so on
# stderr instead of raising IndexError part-way through (target shorter) or
# silently dropping lines (source shorter).
if len(source) != len(target):
    sys.stderr.write(
        "warning: line counts differ (%d source, %d target); "
        "unpaired trailing lines are ignored\n" % (len(source), len(target))
    )

# zip() stops at the shorter list, so every iteration has a line from each side.
for source_line, target_line in zip(source, target):
    s = clean(source_line)
    t = clean(target_line)

    # Drop the pair if either side is rejected by fails() or if both sides
    # are the same text.
    if fails(s) or fails(t) or s == t:
        continue

    # Keep the pair only when the length difference, relative to the combined
    # length of both sentences, is below 0.4. Both strings are non-empty here
    # (fails() rejects anything without a space), so the divisor is never zero.
    if float(abs(len(s) - len(t))) / len(s + t) < 0.4:
        writer.writerow((s, t))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment