syllog1sm/gist:10343947

## gistfile1.py
"""A simple implementation of a greedy transition-based parser. Released under BSD license."""
from os import path
import os
import sys
from collections import defaultdict
import random
import time
import pickle

SHIFT = 0; RIGHT = 1; LEFT = 2;
MOVES = (SHIFT, RIGHT, LEFT)
START = ['-START-', '-START2-']
END = ['-END-', '-END2-']


class DefaultList(list):
    """A list that returns a default value if index out of bounds."""
    def __init__(self, default=None):
        self.default = default
        list.__init__(self)

    def __getitem__(self, index):
        try:
            return list.__getitem__(self, index)
        except IndexError:
            return self.default


class Parse(object):
    def __init__(self, n):
        self.n = n
        self.heads = [None] * (n-1)
        self.labels = [None] * (n-1)
        self.lefts = []
        self.rights = []
        for i in range(n+1):
            self.lefts.append(DefaultList(0))
            self.rights.append(DefaultList(0))

    def add(self, head, child, label=None):
        self.heads[child] = head
        self.labels[child] = label
        if child < head:
            self.lefts[head].append(child)
        else:
            self.rights[head].append(child)


class Parser(object):
    def __init__(self, load=True):
        model_dir = os.path.dirname(__file__)
        self.model = Perceptron(MOVES)
        if load:
            self.model.load(path.join(model_dir, 'parser.pickle'))
        self.tagger = PerceptronTagger(load=load)
        self.confusion_matrix = defaultdict(lambda: defaultdict(int))

    def save(self):
        self.model.save(path.join(os.path.dirname(__file__), 'parser.pickle'))
        self.tagger.save()

    def parse(self, words):
        n = len(words)
        i = 2; stack = [1]; parse = Parse(n)
        tags = self.tagger.tag(words)
        while stack or (i+1) < n:
            features = extract_features(words, tags, i, n, stack, parse)
            scores = self.model.score(features)
            valid_moves = get_valid_moves(i, n, len(stack))
            guess = max(valid_moves, key=lambda move: scores[move])
            i = transition(guess, i, stack, parse)
        return tags, parse.heads

    def train_one(self, itn, words, gold_tags, gold_heads):
        n = len(words)
        i = 2; stack = [1]; parse = Parse(n)
        tags = self.tagger.tag(words)
        while stack or (i + 1) < n:
            features = extract_features(words, tags, i, n, stack, parse)
            scores = self.model.score(features)
            valid_moves = get_valid_moves(i, n, len(stack))
            gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
            guess = max(valid_moves, key=lambda move: scores[move])
            assert gold_moves
            best = max(gold_moves, key=lambda move: scores[move])
            self.model.update(best, guess, features)
            i = transition(guess, i, stack, parse)
            self.confusion_matrix[best][guess] += 1
        return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])


def transition(move, i, stack, parse):
    if move == SHIFT:
        stack.append(i)
        return i + 1
    elif move == RIGHT:
        parse.add(stack[-2], stack.pop())
        return i
    elif move == LEFT:
        parse.add(i, stack.pop())
        return i
    assert move in MOVES


def get_valid_moves(i, n, stack_depth):
    moves = []
    if (i+1) < n:
        moves.append(SHIFT)
    if stack_depth >= 2:
        moves.append(RIGHT)
    if stack_depth >= 1:
        moves.append(LEFT)
    return moves


def get_gold_moves(n0, n, stack, heads, gold):
    def deps_between(target, others, gold):
        for word in others:
            if gold[word] == target or gold[target] == word:
                return True
        return False

    valid = get_valid_moves(n0, n, len(stack))
    if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
        return [SHIFT]
    if gold[stack[-1]] == n0:
        return [LEFT]
    costly = set([m for m in MOVES if m not in valid])
    # If the word behind s0 is its gold head, Left is incorrect
    if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
        costly.add(LEFT)
    # If there are any dependencies between n0 and the stack,
    # pushing n0 will lose them.
    if SHIFT not in costly and deps_between(n0, stack, gold):
        costly.add(SHIFT)
    # If there are any dependencies between s0 and the buffer, popping
    # s0 will lose them.
    if deps_between(stack[-1], range(n0+1, n-1), gold):
        costly.add(LEFT)
        costly.add(RIGHT)
    return [m for m in MOVES if m not in costly]


def extract_features(words, tags, n0, n, stack, parse):
    def get_stack_context(depth, stack, data):
        if depth >= 3:
            return data[stack[-1]], data[stack[-2]], data[stack[-3]]
        elif depth >= 2:
            return data[stack[-1]], data[stack[-2]], ''
        elif depth == 1:
            return data[stack[-1]], '', ''
        else:
            return '', '', ''

    def get_buffer_context(i, n, data):
        if i + 1 >= n:
            return data[i], '', ''
        elif i + 2 >= n:
            return data[i], data[i + 1], ''
        else:
            return data[i], data[i + 1], data[i + 2]

    def get_parse_context(word, deps, data):
        if word == -1:
            return 0, '', ''
        deps = deps[word]
        valency = len(deps)
        if not valency:
            return 0, '', ''
        elif valency == 1:
            return 1, data[deps[-1]], ''
        else:
            return valency, data[deps[-1]], data[deps[-2]]

    features = {}
    # Set up the context pieces --- the word (W) and tag (T) of:
    # S0-2: Top three words on the stack
    # N0-2: First three words of the buffer
    # n0b1, n0b2: Two leftmost children of the first word of the buffer
    # s0b1, s0b2: Two leftmost children of the top word of the stack
    # s0f1, s0f2: Two rightmost children of the top word of the stack

    depth = len(stack)
    s0 = stack[-1] if depth else -1

    Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
    Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)

    Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
    Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)

    Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
    Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)

    Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
    _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)

    Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
    _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)

    Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
    _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)

    # Cap numeric features at 5?
    # String-distance
    Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0

    features['bias'] = 1
    # Add word and tag unigrams
    for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
        if w:
            features['w=%s' % w] = 1
    for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
        if t:
            features['t=%s' % t] = 1

    # Add word/tag pairs
    for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
        if w or t:
            features['%d w=%s, t=%s' % (i, w, t)] = 1

    # Add some bigrams
    features['s0w=%s,  n0w=%s' % (Ws0, Wn0)] = 1
    features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
    features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
    features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
    features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
    features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
    features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
    features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1

    # Add some tag trigrams
    trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
                (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
                (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
                (Ts0, Ts1, Ts1))
    for i, (t1, t2, t3) in enumerate(trigrams):
        if t1 or t2 or t3:
            features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1

    # Add some valency and distance features
    vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
    vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
    d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
         ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
    for i, (w_t, v_d) in enumerate(vw + vt + d):
        if w_t or v_d:
            features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
    return features


class Perceptron(object):
    def __init__(self, classes=None):
        # Each feature gets its own weight vector, so weights is a dict-of-arrays
        self.classes = classes
        self.weights = {}
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best class.'''
        scores = self.score(features)
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda clas: (scores[clas], clas))

    def score(self, features):
        all_weights = self.weights
        scores = dict((clas, 0) for clas in self.classes)
        for feat, value in features.items():
            if value == 0:
                continue
            if feat not in all_weights:
                continue
            weights = all_weights[feat]
            for clas, weight in weights.items():
                scores[clas] += value * weight
        return scores

    def update(self, truth, guess, features):
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)

    def average_weights(self):
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights

    def save(self, path):
        print "Saving model to %s" % path
        pickle.dump(self.weights, open(path, 'w'))

    def load(self, path):
        self.weights = pickle.load(open(path))


class PerceptronTagger(object):
    '''Greedy Averaged Perceptron tagger'''
    model_loc = os.path.join(os.path.dirname(__file__), 'tagger.pickle')
    def __init__(self, classes=None, load=True):
        self.tagdict = {}
        if classes:
            self.classes = classes
        else:
            self.classes = set()
        self.model = Perceptron(self.classes)
        if load:
            self.load(PerceptronTagger.model_loc)

    def tag(self, words, tokenize=True):
        prev, prev2 = START
        tags = DefaultList('')
        context = START + [self._normalize(w) for w in words] + END
        for i, word in enumerate(words):
            tag = self.tagdict.get(word)
            if not tag:
                features = self._get_features(i, word, context, prev, prev2)
                tag = self.model.predict(features)
            tags.append(tag)
            prev2 = prev; prev = tag
        return tags

    def start_training(self, sentences):
        self._make_tagdict(sentences)
        self.model = Perceptron(self.classes)

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at save_loc. nr_iter
        controls the number of Perceptron training iterations.'''
        self.start_training(sentences)
        for iter_ in range(nr_iter):
            for words, tags in sentences:
                self.train_one(words, tags)
            random.shuffle(sentences)
        self.end_training(save_loc)

    def save(self):
        # Pickle as a binary file
        pickle.dump((self.model.weights, self.tagdict, self.classes),
                    open(PerceptronTagger.model_loc, 'wb'), -1)

    def train_one(self, words, tags):
        prev, prev2 = START
        context = START + [self._normalize(w) for w in words] + END
        for i, word in enumerate(words):
            guess = self.tagdict.get(word)
            if not guess:
                feats = self._get_features(i, word, context, prev, prev2)
                guess = self.model.predict(feats)
                self.model.update(tags[i], guess, feats)
            prev2 = prev; prev = guess

    def load(self, loc):
        w_td_c = pickle.load(open(loc, 'rb'))
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes

    def _normalize(self, word):
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.'''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sent in sentences:
            for word, tag in zip(sent[0], sent[1]):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag

def _pc(n, d):
    return (float(n) / d) * 100


def train(parser, sentences, nr_iter):
    parser.tagger.start_training(sentences)
    for itn in range(nr_iter):
        corr = 0; total = 0
        random.shuffle(sentences)
        for words, gold_tags, gold_parse, gold_label in sentences:
            corr += parser.train_one(itn, words, gold_tags, gold_parse)
            if itn < 5:
                parser.tagger.train_one(words, gold_tags)
            total += len(words)
        print itn, '%.3f' % (float(corr) / float(total))
        if itn == 4:
            parser.tagger.model.average_weights()
    print 'Averaging weights'
    parser.model.average_weights()

def read_pos(loc):
    for line in open(loc):
        if not line.strip():
            continue
        words = DefaultList('')
        tags = DefaultList('')
        for token in line.split():
            if not token:
                continue
            word, tag = token.rsplit('/', 1)
            #words.append(normalize(word))
            words.append(word)
            tags.append(tag)
        pad_tokens(words); pad_tokens(tags)
        yield words, tags


def read_conll(loc):
    for sent_str in open(loc).read().strip().split('\n\n'):
        lines = [line.split() for line in sent_str.split('\n')]
        words = DefaultList(''); tags = DefaultList('')
        heads = [None]; labels = [None]
        for i, (word, pos, head, label) in enumerate(lines):
            words.append(intern(word))
            #words.append(intern(normalize(word)))
            tags.append(intern(pos))
            heads.append(int(head) + 1 if head != '-1' else len(lines) + 1)
            labels.append(label)
        pad_tokens(words); pad_tokens(tags)
        yield words, tags, heads, labels


def pad_tokens(tokens):
    tokens.insert(0, '<start>')
    tokens.append('ROOT')


def main(model_dir, train_loc, heldout_in, heldout_gold):
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    input_sents = list(read_pos(heldout_in))
    parser = Parser(load=False)
    sentences = list(read_conll(train_loc))
    train(parser, sentences, nr_iter=15)
    parser.save()
    c = 0
    t = 0
    gold_sents = list(read_conll(heldout_gold))
    t1 = time.time()
    for (words, tags), (_, _, gold_heads, gold_labels) in zip(input_sents, gold_sents):
        _, heads = parser.parse(words)
        for i, w in list(enumerate(words))[1:-1]:
            if gold_labels[i] in ('P', 'punct'):
                continue
            if heads[i] == gold_heads[i]:
                c += 1
            t += 1
    t2 = time.time()
    print 'Parsing took %0.3f ms' % ((t2-t1)*1000.0)
    print c, t, float(c)/t


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
	"""A simple implementation of a greedy transition-based parser. Released under BSD license."""
	from os import path
	import os
	import sys
	from collections import defaultdict
	import random
	import time
	import pickle

	SHIFT = 0; RIGHT = 1; LEFT = 2;
	MOVES = (SHIFT, RIGHT, LEFT)
	START = ['-START-', '-START2-']
	END = ['-END-', '-END2-']


	class DefaultList(list):
	"""A list that returns a default value if index out of bounds."""
	def __init__(self, default=None):
	self.default = default
	list.__init__(self)

	def __getitem__(self, index):
	try:
	return list.__getitem__(self, index)
	except IndexError:
	return self.default


	class Parse(object):
	def __init__(self, n):
	self.n = n
	self.heads = [None] * (n-1)
	self.labels = [None] * (n-1)
	self.lefts = []
	self.rights = []
	for i in range(n+1):
	self.lefts.append(DefaultList(0))
	self.rights.append(DefaultList(0))

	def add(self, head, child, label=None):
	self.heads[child] = head
	self.labels[child] = label
	if child < head:
	self.lefts[head].append(child)
	else:
	self.rights[head].append(child)


	class Parser(object):
	def __init__(self, load=True):
	model_dir = os.path.dirname(__file__)
	self.model = Perceptron(MOVES)
	if load:
	self.model.load(path.join(model_dir, 'parser.pickle'))
	self.tagger = PerceptronTagger(load=load)
	self.confusion_matrix = defaultdict(lambda: defaultdict(int))

	def save(self):
	self.model.save(path.join(os.path.dirname(__file__), 'parser.pickle'))
	self.tagger.save()

	def parse(self, words):
	n = len(words)
	i = 2; stack = [1]; parse = Parse(n)
	tags = self.tagger.tag(words)
	while stack or (i+1) < n:
	features = extract_features(words, tags, i, n, stack, parse)
	scores = self.model.score(features)
	valid_moves = get_valid_moves(i, n, len(stack))
	guess = max(valid_moves, key=lambda move: scores[move])
	i = transition(guess, i, stack, parse)
	return tags, parse.heads

	def train_one(self, itn, words, gold_tags, gold_heads):
	n = len(words)
	i = 2; stack = [1]; parse = Parse(n)
	tags = self.tagger.tag(words)
	while stack or (i + 1) < n:
	features = extract_features(words, tags, i, n, stack, parse)
	scores = self.model.score(features)
	valid_moves = get_valid_moves(i, n, len(stack))
	gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
	guess = max(valid_moves, key=lambda move: scores[move])
	assert gold_moves
	best = max(gold_moves, key=lambda move: scores[move])
	self.model.update(best, guess, features)
	i = transition(guess, i, stack, parse)
	self.confusion_matrix[best][guess] += 1
	return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])


	def transition(move, i, stack, parse):
	if move == SHIFT:
	stack.append(i)
	return i + 1
	elif move == RIGHT:
	parse.add(stack[-2], stack.pop())
	return i
	elif move == LEFT:
	parse.add(i, stack.pop())
	return i
	assert move in MOVES


	def get_valid_moves(i, n, stack_depth):
	moves = []
	if (i+1) < n:
	moves.append(SHIFT)
	if stack_depth >= 2:
	moves.append(RIGHT)
	if stack_depth >= 1:
	moves.append(LEFT)
	return moves


	def get_gold_moves(n0, n, stack, heads, gold):
	def deps_between(target, others, gold):
	for word in others:
	if gold[word] == target or gold[target] == word:
	return True
	return False

	valid = get_valid_moves(n0, n, len(stack))
	if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
	return [SHIFT]
	if gold[stack[-1]] == n0:
	return [LEFT]
	costly = set([m for m in MOVES if m not in valid])
	# If the word behind s0 is its gold head, Left is incorrect
	if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
	costly.add(LEFT)
	# If there are any dependencies between n0 and the stack,
	# pushing n0 will lose them.
	if SHIFT not in costly and deps_between(n0, stack, gold):
	costly.add(SHIFT)
	# If there are any dependencies between s0 and the buffer, popping
	# s0 will lose them.
	if deps_between(stack[-1], range(n0+1, n-1), gold):
	costly.add(LEFT)
	costly.add(RIGHT)
	return [m for m in MOVES if m not in costly]


	def extract_features(words, tags, n0, n, stack, parse):
	def get_stack_context(depth, stack, data):
	if depth >= 3:
	return data[stack[-1]], data[stack[-2]], data[stack[-3]]
	elif depth >= 2:
	return data[stack[-1]], data[stack[-2]], ''
	elif depth == 1:
	return data[stack[-1]], '', ''
	else:
	return '', '', ''

	def get_buffer_context(i, n, data):
	if i + 1 >= n:
	return data[i], '', ''
	elif i + 2 >= n:
	return data[i], data[i + 1], ''
	else:
	return data[i], data[i + 1], data[i + 2]

	def get_parse_context(word, deps, data):
	if word == -1:
	return 0, '', ''
	deps = deps[word]
	valency = len(deps)
	if not valency:
	return 0, '', ''
	elif valency == 1:
	return 1, data[deps[-1]], ''
	else:
	return valency, data[deps[-1]], data[deps[-2]]

	features = {}
	# Set up the context pieces --- the word (W) and tag (T) of:
	# S0-2: Top three words on the stack
	# N0-2: First three words of the buffer
	# n0b1, n0b2: Two leftmost children of the first word of the buffer
	# s0b1, s0b2: Two leftmost children of the top word of the stack
	# s0f1, s0f2: Two rightmost children of the top word of the stack

	depth = len(stack)
	s0 = stack[-1] if depth else -1

	Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
	Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)

	Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
	Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)

	Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
	Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)

	Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
	_, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)

	Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
	_, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)

	Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
	_, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)

	# Cap numeric features at 5?
	# String-distance
	Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0

	features['bias'] = 1
	# Add word and tag unigrams
	for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
	if w:
	features['w=%s' % w] = 1
	for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
	if t:
	features['t=%s' % t] = 1

	# Add word/tag pairs
	for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
	if w or t:
	features['%d w=%s, t=%s' % (i, w, t)] = 1

	# Add some bigrams
	features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
	features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
	features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
	features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
	features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
	features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
	features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
	features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1

	# Add some tag trigrams
	trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
	(Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
	(Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
	(Ts0, Ts1, Ts1))
	for i, (t1, t2, t3) in enumerate(trigrams):
	if t1 or t2 or t3:
	features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1

	# Add some valency and distance features
	vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
	vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
	d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
	('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
	for i, (w_t, v_d) in enumerate(vw + vt + d):
	if w_t or v_d:
	features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
	return features


	class Perceptron(object):
	def __init__(self, classes=None):
	# Each feature gets its own weight vector, so weights is a dict-of-arrays
	self.classes = classes
	self.weights = {}
	# The accumulated values, for the averaging. These will be keyed by
	# feature/clas tuples
	self._totals = defaultdict(int)
	# The last time the feature was changed, for the averaging. Also
	# keyed by feature/clas tuples
	# (tstamps is short for timestamps)
	self._tstamps = defaultdict(int)
	# Number of instances seen
	self.i = 0

	def predict(self, features):
	'''Dot-product the features and current weights and return the best class.'''
	scores = self.score(features)
	# Do a secondary alphabetic sort, for stability
	return max(self.classes, key=lambda clas: (scores[clas], clas))

	def score(self, features):
	all_weights = self.weights
	scores = dict((clas, 0) for clas in self.classes)
	for feat, value in features.items():
	if value == 0:
	continue
	if feat not in all_weights:
	continue
	weights = all_weights[feat]
	for clas, weight in weights.items():
	scores[clas] += value * weight
	return scores

	def update(self, truth, guess, features):
	def upd_feat(c, f, w, v):
	param = (f, c)
	self._totals[param] += (self.i - self._tstamps[param]) * w
	self._tstamps[param] = self.i
	self.weights[f][c] = w + v

	self.i += 1
	if truth == guess:
	return None
	for f in features:
	weights = self.weights.setdefault(f, {})
	upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
	upd_feat(guess, f, weights.get(guess, 0.0), -1.0)

	def average_weights(self):
	for feat, weights in self.weights.items():
	new_feat_weights = {}
	for clas, weight in weights.items():
	param = (feat, clas)
	total = self._totals[param]
	total += (self.i - self._tstamps[param]) * weight
	averaged = round(total / float(self.i), 3)
	if averaged:
	new_feat_weights[clas] = averaged
	self.weights[feat] = new_feat_weights

	def save(self, path):
	print "Saving model to %s" % path
	pickle.dump(self.weights, open(path, 'w'))

	def load(self, path):
	self.weights = pickle.load(open(path))


	class PerceptronTagger(object):
	'''Greedy Averaged Perceptron tagger'''
	model_loc = os.path.join(os.path.dirname(__file__), 'tagger.pickle')
	def __init__(self, classes=None, load=True):
	self.tagdict = {}
	if classes:
	self.classes = classes
	else:
	self.classes = set()
	self.model = Perceptron(self.classes)
	if load:
	self.load(PerceptronTagger.model_loc)

	def tag(self, words, tokenize=True):
	prev, prev2 = START
	tags = DefaultList('')
	context = START + [self._normalize(w) for w in words] + END
	for i, word in enumerate(words):
	tag = self.tagdict.get(word)
	if not tag:
	features = self._get_features(i, word, context, prev, prev2)
	tag = self.model.predict(features)
	tags.append(tag)
	prev2 = prev; prev = tag
	return tags

	def start_training(self, sentences):
	self._make_tagdict(sentences)
	self.model = Perceptron(self.classes)

	def train(self, sentences, save_loc=None, nr_iter=5):
	'''Train a model from sentences, and save it at save_loc. nr_iter
	controls the number of Perceptron training iterations.'''
	self.start_training(sentences)
	for iter_ in range(nr_iter):
	for words, tags in sentences:
	self.train_one(words, tags)
	random.shuffle(sentences)
	self.end_training(save_loc)

	def save(self):
	# Pickle as a binary file
	pickle.dump((self.model.weights, self.tagdict, self.classes),
	open(PerceptronTagger.model_loc, 'wb'), -1)

	def train_one(self, words, tags):
	prev, prev2 = START
	context = START + [self._normalize(w) for w in words] + END
	for i, word in enumerate(words):
	guess = self.tagdict.get(word)
	if not guess:
	feats = self._get_features(i, word, context, prev, prev2)
	guess = self.model.predict(feats)
	self.model.update(tags[i], guess, feats)
	prev2 = prev; prev = guess

	def load(self, loc):
	w_td_c = pickle.load(open(loc, 'rb'))
	self.model.weights, self.tagdict, self.classes = w_td_c
	self.model.classes = self.classes

	def _normalize(self, word):
	if '-' in word and word[0] != '-':
	return '!HYPHEN'
	elif word.isdigit() and len(word) == 4:
	return '!YEAR'
	elif word[0].isdigit():
	return '!DIGITS'
	else:
	return word.lower()

	def _get_features(self, i, word, context, prev, prev2):
	'''Map tokens into a feature representation, implemented as a
	{hashable: float} dict. If the features change, a new model must be
	trained.'''
	def add(name, *args):
	features[' '.join((name,) + tuple(args))] += 1

	i += len(START)
	features = defaultdict(int)
	# It's useful to have a constant feature, which acts sort of like a prior
	add('bias')
	add('i suffix', word[-3:])
	add('i pref1', word[0])
	add('i-1 tag', prev)
	add('i-2 tag', prev2)
	add('i tag+i-2 tag', prev, prev2)
	add('i word', context[i])
	add('i-1 tag+i word', prev, context[i])
	add('i-1 word', context[i-1])
	add('i-1 suffix', context[i-1][-3:])
	add('i-2 word', context[i-2])
	add('i+1 word', context[i+1])
	add('i+1 suffix', context[i+1][-3:])
	add('i+2 word', context[i+2])
	return features

	def _make_tagdict(self, sentences):
	'''Make a tag dictionary for single-tag words.'''
	counts = defaultdict(lambda: defaultdict(int))
	for sent in sentences:
	for word, tag in zip(sent[0], sent[1]):
	counts[word][tag] += 1
	self.classes.add(tag)
	freq_thresh = 20
	ambiguity_thresh = 0.97
	for word, tag_freqs in counts.items():
	tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
	n = sum(tag_freqs.values())
	# Don't add rare words to the tag dictionary
	# Only add quite unambiguous words
	if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
	self.tagdict[word] = tag

	def _pc(n, d):
	return (float(n) / d) * 100


	def train(parser, sentences, nr_iter):
	parser.tagger.start_training(sentences)
	for itn in range(nr_iter):
	corr = 0; total = 0
	random.shuffle(sentences)
	for words, gold_tags, gold_parse, gold_label in sentences:
	corr += parser.train_one(itn, words, gold_tags, gold_parse)
	if itn < 5:
	parser.tagger.train_one(words, gold_tags)
	total += len(words)
	print itn, '%.3f' % (float(corr) / float(total))
	if itn == 4:
	parser.tagger.model.average_weights()
	print 'Averaging weights'
	parser.model.average_weights()

	def read_pos(loc):
	for line in open(loc):
	if not line.strip():
	continue
	words = DefaultList('')
	tags = DefaultList('')
	for token in line.split():
	if not token:
	continue
	word, tag = token.rsplit('/', 1)
	#words.append(normalize(word))
	words.append(word)
	tags.append(tag)
	pad_tokens(words); pad_tokens(tags)
	yield words, tags


	def read_conll(loc):
	for sent_str in open(loc).read().strip().split('\n\n'):
	lines = [line.split() for line in sent_str.split('\n')]
	words = DefaultList(''); tags = DefaultList('')
	heads = [None]; labels = [None]
	for i, (word, pos, head, label) in enumerate(lines):
	words.append(intern(word))
	#words.append(intern(normalize(word)))
	tags.append(intern(pos))
	heads.append(int(head) + 1 if head != '-1' else len(lines) + 1)
	labels.append(label)
	pad_tokens(words); pad_tokens(tags)
	yield words, tags, heads, labels


	def pad_tokens(tokens):
	tokens.insert(0, '<start>')
	tokens.append('ROOT')


	def main(model_dir, train_loc, heldout_in, heldout_gold):
	if not os.path.exists(model_dir):
	os.mkdir(model_dir)

	input_sents = list(read_pos(heldout_in))
	parser = Parser(load=False)
	sentences = list(read_conll(train_loc))
	train(parser, sentences, nr_iter=15)
	parser.save()
	c = 0
	t = 0
	gold_sents = list(read_conll(heldout_gold))
	t1 = time.time()
	for (words, tags), (_, _, gold_heads, gold_labels) in zip(input_sents, gold_sents):
	_, heads = parser.parse(words)
	for i, w in list(enumerate(words))[1:-1]:
	if gold_labels[i] in ('P', 'punct'):
	continue
	if heads[i] == gold_heads[i]:
	c += 1
	t += 1
	t2 = time.time()
	print 'Parsing took %0.3f ms' % ((t2-t1)*1000.0)
	print c, t, float(c)/t


	if __name__ == '__main__':
	main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])