# alrojo/dl.py

## dl.py
# made by Alexander Rosenberg Johansen
# BSD-3 License
# Hardcoded for German, but should be easily extendable to all other WMT languages.
# Note: you will need handle-sgm.py as well.
from six.moves.urllib.request import urlretrieve
import json
import glob
import subprocess
import os
from tqdm import tqdm

# `sc` runs a command and ignores its exit status. It is kept for best-effort
# steps (moves/removals whose targets may legitimately be absent).
sc = subprocess.call
# `run_checked` raises subprocess.CalledProcessError on a non-zero exit status.
# It is used for downloads, extraction, normalization and corpus assembly so
# that a failed step aborts the run instead of silently producing incomplete
# or misaligned corpora.
run_checked = subprocess.check_call


def fetch_and_unpack(url, archive):
    """Download `url` to `archive` with wget, then untar it into data/.

    Raises subprocess.CalledProcessError if wget or tar exits non-zero.
    """
    run_checked(["wget", "-O", archive, url])
    run_checked(["tar", "-xvzf", archive, "-C", "data/"])


# normalize script
run_checked(["wget", "-O", "normalize-punctuation.perl",
             "http://statmt.org/wmt11/normalize-punctuation.perl"])

# -p: do not complain when data/ already exists (e.g. on a re-run)
sc(["mkdir", "-p", "data"])

# download europarl
fetch_and_unpack("statmt.org/wmt13/training-parallel-europarl-v7.tgz", "data/europarl.tgz")

# download commoncrawl
fetch_and_unpack("statmt.org/wmt13/training-parallel-commoncrawl.tgz", "data/cc.tgz")

# download news commentary
fetch_and_unpack("statmt.org/wmt14/training-parallel-nc-v9.tgz", "data/nc.tgz")

# move around and delete
print('moving data around')
sc("mv ./data/training/* ./data", shell=True)
sc("rm -rf data/training", shell=True)
# drop everything whose name matches a non-German language code, plus the
# files ending in "annotation"
for unwanted in ("*fr*", "*es*", "*cs*", "*ru*", "*annotation"):
    sc("rm -rf data/" + unwanted, shell=True)

# DEV SET
# download newstest
print('dev set')
fetch_and_unpack("statmt.org/wmt14/dev.tgz", "data/dev.tgz")
sc("mv ./data/dev/*.de ./data", shell=True)
sc("mv ./data/dev/*.en ./data", shell=True)
sc("rm -rf data/dev", shell=True)
# NOTE(review): presumably removed because it has no German counterpart and is
# not part of the dev concatenation below -- confirm.
sc("rm data/newsdev2014.en", shell=True)

# TEST SET
# download newstest
print('test set')
fetch_and_unpack("statmt.org/wmt14/test-full.tgz", "data/test-full.tgz")
# handle-sgm.py is an external companion script (see the file header); the two
# moves below expect newstest2014.deen.{de,en} to exist in data/test-full
# after it has run.
run_checked(["python", "handle-sgm.py"])
sc("mv ./data/test-full/newstest2014.deen.de ./data", shell=True)
sc("mv ./data/test-full/newstest2014.deen.en ./data", shell=True)
sc("rm -rf data/test-full", shell=True)

# NORMALIZE PUNCTUATION
# Every data/*.de and data/*.en file is piped through
# `perl normalize-punctuation.perl -l <lang>` into <file>.prenorm.
paths = glob.glob("data/*.de") + glob.glob("data/*.en")
for path in paths:
    lang = path[-2:]  # the two-letter extension selected by the globs above
    path_norm = path + '.prenorm'
    # Wire stdin/stdout to file handles instead of building a shell string,
    # so unusual characters in a path cannot alter the command.
    with open(path, 'rb') as src, open(path_norm, 'wb') as dst:
        run_checked(["perl", "normalize-punctuation.perl", "-l", lang],
                    stdin=src, stdout=dst)
    print(path_norm)

# the un-normalized originals are no longer needed
sc("rm -rf data/*.en", shell=True)
sc("rm -rf data/*.de", shell=True)

# Concatenate the normalized parts into one file per split and language.
# The part order is identical for both languages, which keeps the two sides
# of each split line-aligned.
train_parts = ("commoncrawl.de-en", "europarl-v7.de-en", "news-commentary-v9.de-en")
dev_parts = ("newssyscomb2009", "news-test2008", "newstest2009", "newstest2010",
             "newstest2011", "newstest2012", "newstest2013")
for split, parts in (("train", train_parts), ("dev", dev_parts)):
    for lang in ("de", "en"):
        sources = ["data/%s.%s.prenorm" % (part, lang) for part in parts]
        target = "data/WMT2014.%s.deen.%s.norm" % (split, lang)
        with open(target, 'wb') as out:
            run_checked(["cat"] + sources, stdout=out)

# the test set is a single file per language, so it is only renamed
for lang in ("de", "en"):
    run_checked(["mv", "data/newstest2014.deen.%s.prenorm" % lang,
                 "data/WMT2014.test.deen.%s.norm" % lang])

sc("rm -rf data/*.prenorm", shell=True)
	# made by Alexander Rosenberg Johansen
	# BSD-3 License
	# Hardcoded for German, but should be easily extendable to all other wmt languages
	# notice: you will need handle-sgm.py as well
	from six.moves.urllib.request import urlretrieve
	import json
	import glob
	import subprocess
	import os
	from tqdm import tqdm

	# NOTE(review): this tab-indented section repeats the script found earlier in
	# this file; it looks like a duplicated paste -- confirm whether it should be
	# kept. The glob patterns and loop structure below follow that earlier copy.

	# `sc` runs a command and ignores its exit status. It is kept for best-effort
	# steps (moves/removals whose targets may legitimately be absent).
	sc = subprocess.call
	# `run_checked` raises subprocess.CalledProcessError on a non-zero exit status.
	# It is used for downloads, extraction, normalization and corpus assembly so
	# that a failed step aborts the run instead of silently producing incomplete
	# or misaligned corpora.
	run_checked = subprocess.check_call


	def fetch_and_unpack(url, archive):
		"""Download `url` to `archive` with wget, then untar it into data/.

		Raises subprocess.CalledProcessError if wget or tar exits non-zero.
		"""
		run_checked(["wget", "-O", archive, url])
		run_checked(["tar", "-xvzf", archive, "-C", "data/"])


	# normalize script
	run_checked(["wget", "-O", "normalize-punctuation.perl",
		"http://statmt.org/wmt11/normalize-punctuation.perl"])

	# -p: do not complain when data/ already exists (e.g. on a re-run)
	sc(["mkdir", "-p", "data"])

	# download europarl
	fetch_and_unpack("statmt.org/wmt13/training-parallel-europarl-v7.tgz", "data/europarl.tgz")

	# download commoncrawl
	fetch_and_unpack("statmt.org/wmt13/training-parallel-commoncrawl.tgz", "data/cc.tgz")

	# download news commentary
	fetch_and_unpack("statmt.org/wmt14/training-parallel-nc-v9.tgz", "data/nc.tgz")

	# move around and delete
	print('moving data around')
	sc("mv ./data/training/* ./data", shell=True)
	sc("rm -rf data/training", shell=True)
	# drop everything whose name matches a non-German language code, plus the
	# files ending in "annotation" (wildcards restored: without them the
	# patterns matched nothing)
	for unwanted in ("*fr*", "*es*", "*cs*", "*ru*", "*annotation"):
		sc("rm -rf data/" + unwanted, shell=True)

	# DEV SET
	# download newstest
	print('dev set')
	fetch_and_unpack("statmt.org/wmt14/dev.tgz", "data/dev.tgz")
	sc("mv ./data/dev/*.de ./data", shell=True)
	sc("mv ./data/dev/*.en ./data", shell=True)
	sc("rm -rf data/dev", shell=True)
	# NOTE(review): presumably removed because it has no German counterpart and is
	# not part of the dev concatenation below -- confirm.
	sc("rm data/newsdev2014.en", shell=True)

	# TEST SET
	# download newstest
	print('test set')
	fetch_and_unpack("statmt.org/wmt14/test-full.tgz", "data/test-full.tgz")
	# handle-sgm.py is an external companion script; the two moves below expect
	# newstest2014.deen.{de,en} to exist in data/test-full after it has run.
	run_checked(["python", "handle-sgm.py"])
	sc("mv ./data/test-full/newstest2014.deen.de ./data", shell=True)
	sc("mv ./data/test-full/newstest2014.deen.en ./data", shell=True)
	sc("rm -rf data/test-full", shell=True)

	# NORMALIZE PUNCTUATION
	# Every data/*.de and data/*.en file is piped through
	# `perl normalize-punctuation.perl -l <lang>` into <file>.prenorm.
	paths = glob.glob("data/*.de") + glob.glob("data/*.en")
	for path in paths:
		lang = path[-2:]  # the two-letter extension selected by the globs above
		path_norm = path + '.prenorm'
		# Wire stdin/stdout to file handles instead of building a shell string,
		# so unusual characters in a path cannot alter the command.
		with open(path, 'rb') as src, open(path_norm, 'wb') as dst:
			run_checked(["perl", "normalize-punctuation.perl", "-l", lang],
				stdin=src, stdout=dst)
		print(path_norm)

	# the un-normalized originals are no longer needed
	sc("rm -rf data/*.en", shell=True)
	sc("rm -rf data/*.de", shell=True)

	# Concatenate the normalized parts into one file per split and language.
	# The part order is identical for both languages, which keeps the two sides
	# of each split line-aligned.
	train_parts = ("commoncrawl.de-en", "europarl-v7.de-en", "news-commentary-v9.de-en")
	dev_parts = ("newssyscomb2009", "news-test2008", "newstest2009", "newstest2010",
		"newstest2011", "newstest2012", "newstest2013")
	for split, parts in (("train", train_parts), ("dev", dev_parts)):
		for lang in ("de", "en"):
			sources = ["data/%s.%s.prenorm" % (part, lang) for part in parts]
			target = "data/WMT2014.%s.deen.%s.norm" % (split, lang)
			with open(target, 'wb') as out:
				run_checked(["cat"] + sources, stdout=out)

	# the test set is a single file per language, so it is only renamed
	for lang in ("de", "en"):
		run_checked(["mv", "data/newstest2014.deen.%s.prenorm" % lang,
			"data/WMT2014.test.deen.%s.norm" % lang])

	sc("rm -rf data/*.prenorm", shell=True)