Skip to content

Instantly share code, notes, and snippets.

@alrojo
Created March 2, 2018 11:54
Show Gist options
  • Save alrojo/5ec26b2c2c6356f37ca73a0b475089c8 to your computer and use it in GitHub Desktop.
Save alrojo/5ec26b2c2c6356f37ca73a0b475089c8 to your computer and use it in GitHub Desktop.
wmt 2014
# made by Alexander Rosenberg Johansen
# BSD-3 License
# Hardcoded for German-English, but should be easy to extend to the other WMT language pairs.
# Note: handle-sgm.py must also be present in the working directory; this script runs it for the test set.
from six.moves.urllib.request import urlretrieve
import json
import glob
import subprocess
import os
from tqdm import tqdm
# Short alias used for every external command in this script. The exit status
# returned by subprocess.call is never checked, so a failing command does not
# stop the run.
sc = subprocess.call
# Fetch the punctuation normalization script that is run with perl further down.
sc(["wget", "-O", "normalize-punctuation.perl", "http://statmt.org/wmt11/normalize-punctuation.perl"])
# -p makes the call a no-op (instead of an error) when data/ already exists,
# e.g. when the script is re-run.
sc(["mkdir", "-p", "data"])
# Training corpora: fetch each archive, then unpack it into data/.
for archive, url in (
    # europarl
    ("data/europarl.tgz", "statmt.org/wmt13/training-parallel-europarl-v7.tgz"),
    # commoncrawl
    ("data/cc.tgz", "statmt.org/wmt13/training-parallel-commoncrawl.tgz"),
    # news commentary
    ("data/nc.tgz", "statmt.org/wmt14/training-parallel-nc-v9.tgz"),
):
    sc(["wget", "-O", archive, url])
    sc(["tar", "-xvzf", archive, "-C", "data/"])
# Flatten data/training into data/, then remove every entry whose name matches
# the fr/es/cs/ru patterns as well as the *annotation files.
print('moving data around')
for cleanup in (
    "mv ./data/training/* ./data",
    "rm -rf data/training",
    "rm -rf data/*fr*",
    "rm -rf data/*es*",
    "rm -rf data/*cs*",
    "rm -rf data/*ru*",
    "rm -rf data/*annotation",
):
    # These go through the shell because they rely on glob expansion.
    sc(cleanup, shell=True)
#DEV SET
# Fetch the dev archive, unpack it, and keep only its .de/.en files in data/.
print('dev set')
dev_archive = "data/dev.tgz"
sc(["wget", "-O", dev_archive, "statmt.org/wmt14/dev.tgz"])
sc(["tar", "-xvzf", dev_archive, "-C", "data/"])
for dev_cmd in (
    "mv ./data/dev/*.de ./data",
    "mv ./data/dev/*.en ./data",
    "rm -rf data/dev",
    # newsdev2014.en is dropped again right after being moved into data/.
    "rm data/newsdev2014.en",
):
    sc(dev_cmd, shell=True)
#TEST SET
# Fetch the full test archive and keep only the newstest2014 de-en pair.
print('test set')
test_archive = "data/test-full.tgz"
sc(["wget", "-O", test_archive, "statmt.org/wmt14/test-full.tgz"])
sc(["tar", "-xvzf", test_archive, "-C", "data/"])
# handle-sgm.py is a separate script expected in the working directory.
# NOTE(review): presumably it produces the newstest2014.deen.{de,en} files
# moved below -- confirm against handle-sgm.py.
sc(["python", "handle-sgm.py"])
for test_cmd in (
    "mv ./data/test-full/newstest2014.deen.de ./data",
    "mv ./data/test-full/newstest2014.deen.en ./data",
    "rm -rf data/test-full",
):
    sc(test_cmd, shell=True)
#NORMALIZE PUNCTUATION
# Run normalize-punctuation.perl over every .de/.en file currently in data/,
# writing the output next to the input as <file>.prenorm.
paths = glob.glob("data/*.de") + glob.glob("data/*.en")
for path in paths:
    # The two-letter file extension doubles as the language code passed to -l.
    lang = path[-2:]
    command1 = 'perl normalize-punctuation.perl -l'
    path_norm = path + '.prenorm'
    # Shell redirection feeds the input file on stdin and captures stdout.
    call1 = '%s %s < %s > %s' % (command1, lang, path, path_norm)
    sc(call1, shell=True)
    print(path_norm)
# Remove the raw .en/.de inputs; only the .prenorm outputs are used from here on.
sc("rm -rf data/*.en",shell = True)
sc("rm -rf data/*.de",shell = True)
# Training set: concatenate commoncrawl, europarl and news-commentary per language.
sc("cat data/commoncrawl.de-en.de.prenorm data/europarl-v7.de-en.de.prenorm data/news-commentary-v9.de-en.de.prenorm > data/WMT2014.train.deen.de.norm", shell = True )
sc("cat data/commoncrawl.de-en.en.prenorm data/europarl-v7.de-en.en.prenorm data/news-commentary-v9.de-en.en.prenorm > data/WMT2014.train.deen.en.norm", shell = True )
# Dev set: concatenate newssyscomb2009, news-test2008 and newstest2009-2013 per language.
sc("cat data/newssyscomb2009.de.prenorm data/news-test2008.de.prenorm data/newstest2009.de.prenorm data/newstest2010.de.prenorm data/newstest2011.de.prenorm data/newstest2012.de.prenorm data/newstest2013.de.prenorm > data/WMT2014.dev.deen.de.norm", shell = True )
sc("cat data/newssyscomb2009.en.prenorm data/news-test2008.en.prenorm data/newstest2009.en.prenorm data/newstest2010.en.prenorm data/newstest2011.en.prenorm data/newstest2012.en.prenorm data/newstest2013.en.prenorm > data/WMT2014.dev.deen.en.norm", shell = True )
# Test set: newstest2014 is simply renamed to the final naming scheme.
sc("mv data/newstest2014.deen.de.prenorm data/WMT2014.test.deen.de.norm", shell=True)
sc("mv data/newstest2014.deen.en.prenorm data/WMT2014.test.deen.en.norm", shell=True)
# Drop the remaining intermediate .prenorm files.
sc("rm -rf data/*.prenorm", shell=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment