Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active May 28, 2019 07:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/7507135 to your computer and use it in GitHub Desktop.
Save andreasvc/7507135 to your computer and use it in GitHub Desktop.
Tiger & Lassy train-dev-test splits

Tiger & Lassy train-dev-test splits

These scripts produce the train-dev-test splits for the Tiger & Lassy treebanks used in my 2013 IWPT paper. The Tiger treebank version 2.1 was used, namely tiger_release_aug07.export. The Lassy treebank was version 1.1, or lassy-r19749. The development & test sets are deliberately not just the last 20% of each treebank: that would give them an uneven distribution of sentence lengths & topics, so the sentences are spread out to keep the sets balanced.

"""Create a 80-10-10 train-dev-test split of a treebank with one sentence per
file, of the form ``section/sentid.xml``, but distribute sources evenly,
denoted by a hyphen delimited prefix, e.g., 'WR-P-P-H-0000000007'. Produces
symlinks in a separate directory."""
from __future__ import print_function
import os
import glob
from collections import defaultdict
from distutils.version import LooseVersion
PATH = 'lassy-r19749/Treebank/'
os.chdir(PATH)
sents = defaultdict(list)
for sent in sorted(glob.glob('*/*.xml'), key=LooseVersion):
source, _ = sent.split('-', 1)
sents[source].append(sent)
os.chdir('../..')
os.mkdir('lassy-split')
os.mkdir('lassy-split/train')
os.mkdir('lassy-split/dev')
os.mkdir('lassy-split/test')
for source, chunk in sents.items():
split1 = int(0.8 * len(chunk))
split2 = int(0.9 * len(chunk))
train, dev, test = chunk[:split1], chunk[split1:split2], chunk[split2:]
for subset, name in zip([train, dev, test], "train dev test".split()):
for sent in subset:
base = 'lassy-split/%s/%s' % (name, os.path.split(sent)[0])
if not os.path.exists(base):
os.mkdir(base)
src = '../../../' + PATH + sent
dst = 'lassy-split/%s/%s' % (name, sent)
os.symlink(src, dst)
"""The train-test split described in Hall & Nivre (2008),
Parsing Discontinuous Phrase Structure with Grammatical Functions.
The corpus is divided into sections 0-9, where sentence i (counting from 1) is allocated to section i mod 10.
For development train on sections 2-9; evaluate on section 1.
For final evaluation (test) train on sections 1-9; evaluate on section 0.
"""
import io
import os
from discodop.treebank import NegraCorpusReader

corpus = NegraCorpusReader('tiger/corpus/tiger_release_aug07.export',
        encoding='iso-8859-1')
# Fetch the blocks once, before any output is created; every file written
# below is a selection from this same collection.
blocks = corpus.blocks()

# Output file name paired with a test on the section number (n % 10, where n
# is the 1-based position of the sentence) that decides whether a sentence is
# written to that file.
SPLITS = [
    ('tigertraindev.export', lambda section: section > 1),
    ('tigerdev.export', lambda section: section == 1),
    ('tigertraintest.export', lambda section: section != 0),
    ('tigertest.export', lambda section: section == 0),
]

os.mkdir('tiger-split/')
for filename, selected in SPLITS:
    # Use a context manager so that each file is flushed and closed as soon
    # as it has been written, instead of whenever the file object happens to
    # be garbage collected.
    with io.open('tiger-split/' + filename, 'w', encoding='utf8') as out:
        out.writelines(
                block for n, block in enumerate(blocks.values(), 1)
                if selected(n % 10))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment