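"""Extract a TED talk in two languages and create a CSV of matching translated strings for flashcard use.

Some potentially mistranslated entries are stripped from the output.
Download the desired parallel corpus before running this script.
TODO: the download link that originally followed this sentence is missing.
"""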
import unicodecsv
import sys
import argparse
import re
# Command-line interface: two positional paths, one file per language.
parser = argparse.ArgumentParser()
parser.add_argument("source", help="Source language file")
parser.add_argument("target", help="Target language file")
args = parser.parse_args()

# Load both files fully into memory as lists of lines; the main loop pairs
# them up by line position. Context managers make sure the handles are closed
# as soon as the contents have been read.
with open(args.source) as source_file:
    source = source_file.readlines()
with open(args.target) as target_file:
    target = target_file.readlines()
def clean(sentence):
    """Return *sentence* with leading and trailing whitespace removed.

    This also drops the trailing newline left on each line by ``readlines()``.
    """
    # NOTE(review): the two arguments to replace() appear identical here, which
    # would make the call a no-op. Presumably the first one was originally a
    # double space or a non-breaking space -- confirm against the upstream
    # script before relying on any normalisation beyond strip().
    return sentence.replace(" ", " ").strip()
def fails(sentence):
    """Return True when *sentence* should be rejected as a flashcard entry.

    A sentence is rejected when any of the following holds:

    * it contains an odd number of double-quote characters;
    * it contains no space at all;
    * it contains a colon followed by a space;
    * it does not start with an ASCII capital letter (A-Z);
    * it does not end with ".", "!" or "?".

    The empty string is rejected (it contains no space).
    """
    return bool(
        sentence.count('"') % 2
        or " " not in sentence
        or ": " in sentence
        # The pattern is anchored with "^", so only the first character is
        # examined; non-ASCII capitals do not match [A-Z].
        or not re.match(r"^[A-Z]", sentence)
        # endswith() accepts a tuple of suffixes and is safe on an empty string.
        or not sentence.endswith((".", "!", "?"))
    )
# Rows go to stdout, UTF-8 encoded, with "|" as the field delimiter and "~" as
# the quote character.
writer = unicodecsv.writer(sys.stdout, encoding="utf-8", delimiter="|", quotechar="~")

# Walk both files in lockstep: line N of the source is paired with line N of
# the target. zip() stops at the shorter file, so a length mismatch no longer
# raises IndexError part-way through the output.
for source_line, target_line in zip(source, target):
    s = clean(source_line)
    t = clean(target_line)

    # Skip the pair when either side fails the sentence checks, or when both
    # sides are identical (presumably the line was left untranslated).
    if fails(s) or fails(t) or s == t:
        continue

    # Keep the pair only if the length difference is below 40% of the combined
    # length. float() keeps this a true division under Python 2.
    if float(abs(len(s) - len(t))) / len(s + t) < 0.4:
        writer.writerow((s, t))
