andreasvc/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Tiger & Lassy train-dev-test splits

These scripts produce the train-dev-test splits of the Tiger & Lassy treebanks
used in my 2013 IWPT paper. The Tiger treebank version 2.1 was used, namely
tiger_release_aug07.export. The Lassy treebank was version 1.1, i.e.,
lassy-r19749. The development & test sets are not simply taken from the last
20% of each treebank, because the sentences would then have an uneven
distribution of length & topics; these splits ensure a balanced distribution.

  
## lassysplit.py
"""Create a 80-10-10 train-dev-test split of a treebank with one sentence per
file, of the form ``section/sentid.xml``, but distribute sources evenly,
denoted by a hyphen delimited prefix, e.g., 'WR-P-P-H-0000000007'. Produces
symlinks in a separate directory."""
from __future__ import print_function
import os
import glob
from collections import defaultdict
from distutils.version import LooseVersion

PATH = 'lassy-r19749/Treebank/'

# Collect the sentence files relative to the treebank directory and group them
# by the text that precedes the first hyphen of their relative path.
os.chdir(PATH)
sents = defaultdict(list)
filenames = glob.glob('*/*.xml')
filenames.sort(key=LooseVersion)
for filename in filenames:
	prefix, _ = filename.split('-', 1)
	sents[prefix].append(filename)

# Undo the chdir into PATH (two directory levels) and create the output tree.
os.chdir('../..')
for outdir in ('lassy-split', 'lassy-split/train', 'lassy-split/dev',
		'lassy-split/test'):
	os.mkdir(outdir)

# Within each group, the first 80% of the files go to train, the next 10% to
# dev, and the remainder to test.
for files in sents.values():
	total = len(files)
	devstart = int(0.8 * total)
	teststart = int(0.9 * total)
	subsets = (
			('train', files[:devstart]),
			('dev', files[devstart:teststart]),
			('test', files[teststart:]))
	for name, subset in subsets:
		for filename in subset:
			# Mirror the section directory of the sentence inside the subset.
			base = 'lassy-split/%s/%s' % (name, os.path.dirname(filename))
			if not os.path.exists(base):
				os.mkdir(base)
			# The link target is relative to the directory holding the link,
			# which lies three levels below the starting directory.
			os.symlink(
					'../../../' + PATH + filename,
					'lassy-split/%s/%s' % (name, filename))

## tigersplit.py
"""The train-test split described in Hall & Nivre (2008),
Parsing Discontinuous Phrase Structure with Grammatical Functions.

Corpus is divided in Sections 0-9, where sentence i is allocated to section i mod 10.
For development train on sections 2-9; evaluate on section 1.
For final evaluation (test) train on sections 1-9; evaluate on section 0.
"""
import io
import os
from discodop.treebank import NegraCorpusReader

corpus = NegraCorpusReader('tiger/corpus/tiger_release_aug07.export',
		encoding='iso-8859-1')


def _writesections(filename, keep):
	"""Write the corpus blocks of the selected sections to ``filename``.

	Blocks are numbered from 1 in corpus order; block ``n`` is written when
	``keep(n % 10)`` is true. The file is written as UTF-8 and is closed as
	soon as writing finishes, rather than being left for the interpreter to
	close at some later point.

	:param filename: path of the export file to create.
	:param keep: predicate that receives a section number (0-9)."""
	with io.open(filename, 'w', encoding='utf8') as out:
		out.writelines(
				block for n, block in enumerate(corpus.blocks().values(), 1)
				if keep(n % 10))


os.mkdir('tiger-split/')
# Development: train on sections 2-9, evaluate on section 1.
_writesections('tiger-split/tigertraindev.export', lambda section: section > 1)
_writesections('tiger-split/tigerdev.export', lambda section: section == 1)

# Final evaluation: train on sections 1-9, evaluate on section 0.
_writesections('tiger-split/tigertraintest.export',
		lambda section: section != 0)
_writesections('tiger-split/tigertest.export', lambda section: section == 0)
	"""Create a 80-10-10 train-dev-test split of a treebank with one sentence per
	file, of the form ``section/sentid.xml``, but distribute sources evenly,
	denoted by a hyphen delimited prefix, e.g., 'WR-P-P-H-0000000007'. Produces
	symlinks in a separate directory."""
	from __future__ import print_function
	import os
	import glob
	from collections import defaultdict
	from distutils.version import LooseVersion

	PATH = 'lassy-r19749/Treebank/'
	os.chdir(PATH)
	sents = defaultdict(list)

	# Sentence files live one directory deep (``section/sentid.xml``); group
	# them by the text that precedes the first hyphen of their relative path.
	for sent in sorted(glob.glob('*/*.xml'), key=LooseVersion):
		source, _ = sent.split('-', 1)
		sents[source].append(sent)

	# Undo the chdir into PATH (two directory levels) before creating output.
	os.chdir('../..')
	os.mkdir('lassy-split')
	os.mkdir('lassy-split/train')
	os.mkdir('lassy-split/dev')
	os.mkdir('lassy-split/test')

	for source, chunk in sents.items():
		# First 80% of each group is train, the next 10% dev, the rest test.
		split1 = int(0.8 * len(chunk))
		split2 = int(0.9 * len(chunk))
		train, dev, test = chunk[:split1], chunk[split1:split2], chunk[split2:]
		for subset, name in zip([train, dev, test], "train dev test".split()):
			for sent in subset:
				# Mirror the section directory inside the subset directory.
				base = 'lassy-split/%s/%s' % (name, os.path.split(sent)[0])
				if not os.path.exists(base):
					os.mkdir(base)
				# The link target is relative to the directory holding the
				# link, three levels below the starting directory.
				src = '../../../' + PATH + sent
				dst = 'lassy-split/%s/%s' % (name, sent)
				os.symlink(src, dst)
	"""The train-test split described in Hall & Nivre (2008),
	Parsing Discontinuous Phrase Structure with Grammatical Functions.

	Corpus is divided in Sections 0-9, where sentence i is allocated to section i mod 10.
	For development train on sections 2-9; evaluate on section 1.
	For final evaluation (test) train on sections 1-9; evaluate on section 0.
	"""
	import io
	import os
	from discodop.treebank import NegraCorpusReader

	corpus = NegraCorpusReader('tiger/corpus/tiger_release_aug07.export',
			encoding='iso-8859-1')


	def _writesections(filename, keep):
		"""Write the corpus blocks of the selected sections to ``filename``.

		Blocks are numbered from 1 in corpus order; block ``n`` is written
		when ``keep(n % 10)`` is true. The file is written as UTF-8 and is
		closed as soon as writing finishes, rather than being left for the
		interpreter to close at some later point.

		:param filename: path of the export file to create.
		:param keep: predicate that receives a section number (0-9)."""
		with io.open(filename, 'w', encoding='utf8') as out:
			out.writelines(
					block
					for n, block in enumerate(corpus.blocks().values(), 1)
					if keep(n % 10))


	os.mkdir('tiger-split/')
	# Development: train on sections 2-9, evaluate on section 1.
	_writesections('tiger-split/tigertraindev.export',
			lambda section: section > 1)
	_writesections('tiger-split/tigerdev.export',
			lambda section: section == 1)

	# Final evaluation: train on sections 1-9, evaluate on section 0.
	_writesections('tiger-split/tigertraintest.export',
			lambda section: section != 0)
	_writesections('tiger-split/tigertest.export',
			lambda section: section == 0)