Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
A simple Python dependency parser
"""A simple implementation of a greedy transition-based parser. Released under BSD license."""
from os import path
import os
import sys
from collections import defaultdict
import random
import time
import pickle
SHIFT = 0; RIGHT = 1; LEFT = 2;
MOVES = (SHIFT, RIGHT, LEFT)
START = ['-START-', '-START2-']
END = ['-END-', '-END2-']
class DefaultList(list):
"""A list that returns a default value if index out of bounds."""
def __init__(self, default=None):
self.default = default
list.__init__(self)
def __getitem__(self, index):
try:
return list.__getitem__(self, index)
except IndexError:
return self.default
class Parse(object):
def __init__(self, n):
self.n = n
self.heads = [None] * (n-1)
self.labels = [None] * (n-1)
self.lefts = []
self.rights = []
for i in range(n+1):
self.lefts.append(DefaultList(0))
self.rights.append(DefaultList(0))
def add(self, head, child, label=None):
self.heads[child] = head
self.labels[child] = label
if child < head:
self.lefts[head].append(child)
else:
self.rights[head].append(child)
class Parser(object):
def __init__(self, load=True):
model_dir = os.path.dirname(__file__)
self.model = Perceptron(MOVES)
if load:
self.model.load(path.join(model_dir, 'parser.pickle'))
self.tagger = PerceptronTagger(load=load)
self.confusion_matrix = defaultdict(lambda: defaultdict(int))
def save(self):
self.model.save(path.join(os.path.dirname(__file__), 'parser.pickle'))
self.tagger.save()
def parse(self, words):
n = len(words)
i = 2; stack = [1]; parse = Parse(n)
tags = self.tagger.tag(words)
while stack or (i+1) < n:
features = extract_features(words, tags, i, n, stack, parse)
scores = self.model.score(features)
valid_moves = get_valid_moves(i, n, len(stack))
guess = max(valid_moves, key=lambda move: scores[move])
i = transition(guess, i, stack, parse)
return tags, parse.heads
def train_one(self, itn, words, gold_tags, gold_heads):
n = len(words)
i = 2; stack = [1]; parse = Parse(n)
tags = self.tagger.tag(words)
while stack or (i + 1) < n:
features = extract_features(words, tags, i, n, stack, parse)
scores = self.model.score(features)
valid_moves = get_valid_moves(i, n, len(stack))
gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
guess = max(valid_moves, key=lambda move: scores[move])
assert gold_moves
best = max(gold_moves, key=lambda move: scores[move])
self.model.update(best, guess, features)
i = transition(guess, i, stack, parse)
self.confusion_matrix[best][guess] += 1
return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])
def transition(move, i, stack, parse):
if move == SHIFT:
stack.append(i)
return i + 1
elif move == RIGHT:
parse.add(stack[-2], stack.pop())
return i
elif move == LEFT:
parse.add(i, stack.pop())
return i
assert move in MOVES
def get_valid_moves(i, n, stack_depth):
moves = []
if (i+1) < n:
moves.append(SHIFT)
if stack_depth >= 2:
moves.append(RIGHT)
if stack_depth >= 1:
moves.append(LEFT)
return moves
def get_gold_moves(n0, n, stack, heads, gold):
def deps_between(target, others, gold):
for word in others:
if gold[word] == target or gold[target] == word:
return True
return False
valid = get_valid_moves(n0, n, len(stack))
if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
return [SHIFT]
if gold[stack[-1]] == n0:
return [LEFT]
costly = set([m for m in MOVES if m not in valid])
# If the word behind s0 is its gold head, Left is incorrect
if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
costly.add(LEFT)
# If there are any dependencies between n0 and the stack,
# pushing n0 will lose them.
if SHIFT not in costly and deps_between(n0, stack, gold):
costly.add(SHIFT)
# If there are any dependencies between s0 and the buffer, popping
# s0 will lose them.
if deps_between(stack[-1], range(n0+1, n-1), gold):
costly.add(LEFT)
costly.add(RIGHT)
return [m for m in MOVES if m not in costly]
def extract_features(words, tags, n0, n, stack, parse):
def get_stack_context(depth, stack, data):
if depth >= 3:
return data[stack[-1]], data[stack[-2]], data[stack[-3]]
elif depth >= 2:
return data[stack[-1]], data[stack[-2]], ''
elif depth == 1:
return data[stack[-1]], '', ''
else:
return '', '', ''
def get_buffer_context(i, n, data):
if i + 1 >= n:
return data[i], '', ''
elif i + 2 >= n:
return data[i], data[i + 1], ''
else:
return data[i], data[i + 1], data[i + 2]
def get_parse_context(word, deps, data):
if word == -1:
return 0, '', ''
deps = deps[word]
valency = len(deps)
if not valency:
return 0, '', ''
elif valency == 1:
return 1, data[deps[-1]], ''
else:
return valency, data[deps[-1]], data[deps[-2]]
features = {}
# Set up the context pieces --- the word (W) and tag (T) of:
# S0-2: Top three words on the stack
# N0-2: First three words of the buffer
# n0b1, n0b2: Two leftmost children of the first word of the buffer
# s0b1, s0b2: Two leftmost children of the top word of the stack
# s0f1, s0f2: Two rightmost children of the top word of the stack
depth = len(stack)
s0 = stack[-1] if depth else -1
Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)
Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)
Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)
Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
_, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)
Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
_, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)
Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
_, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)
# Cap numeric features at 5?
# String-distance
Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0
features['bias'] = 1
# Add word and tag unigrams
for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
if w:
features['w=%s' % w] = 1
for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
if t:
features['t=%s' % t] = 1
# Add word/tag pairs
for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
if w or t:
features['%d w=%s, t=%s' % (i, w, t)] = 1
# Add some bigrams
features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1
# Add some tag trigrams
trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
(Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
(Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
(Ts0, Ts1, Ts1))
for i, (t1, t2, t3) in enumerate(trigrams):
if t1 or t2 or t3:
features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1
# Add some valency and distance features
vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
for i, (w_t, v_d) in enumerate(vw + vt + d):
if w_t or v_d:
features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
return features
class Perceptron(object):
def __init__(self, classes=None):
# Each feature gets its own weight vector, so weights is a dict-of-arrays
self.classes = classes
self.weights = {}
# The accumulated values, for the averaging. These will be keyed by
# feature/clas tuples
self._totals = defaultdict(int)
# The last time the feature was changed, for the averaging. Also
# keyed by feature/clas tuples
# (tstamps is short for timestamps)
self._tstamps = defaultdict(int)
# Number of instances seen
self.i = 0
def predict(self, features):
'''Dot-product the features and current weights and return the best class.'''
scores = self.score(features)
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda clas: (scores[clas], clas))
def score(self, features):
all_weights = self.weights
scores = dict((clas, 0) for clas in self.classes)
for feat, value in features.items():
if value == 0:
continue
if feat not in all_weights:
continue
weights = all_weights[feat]
for clas, weight in weights.items():
scores[clas] += value * weight
return scores
def update(self, truth, guess, features):
def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v
self.i += 1
if truth == guess:
return None
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
def average_weights(self):
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
def save(self, path):
print "Saving model to %s" % path
pickle.dump(self.weights, open(path, 'w'))
def load(self, path):
self.weights = pickle.load(open(path))
class PerceptronTagger(object):
'''Greedy Averaged Perceptron tagger'''
model_loc = os.path.join(os.path.dirname(__file__), 'tagger.pickle')
def __init__(self, classes=None, load=True):
self.tagdict = {}
if classes:
self.classes = classes
else:
self.classes = set()
self.model = Perceptron(self.classes)
if load:
self.load(PerceptronTagger.model_loc)
def tag(self, words, tokenize=True):
prev, prev2 = START
tags = DefaultList('')
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
tag = self.tagdict.get(word)
if not tag:
features = self._get_features(i, word, context, prev, prev2)
tag = self.model.predict(features)
tags.append(tag)
prev2 = prev; prev = tag
return tags
def start_training(self, sentences):
self._make_tagdict(sentences)
self.model = Perceptron(self.classes)
def train(self, sentences, save_loc=None, nr_iter=5):
'''Train a model from sentences, and save it at save_loc. nr_iter
controls the number of Perceptron training iterations.'''
self.start_training(sentences)
for iter_ in range(nr_iter):
for words, tags in sentences:
self.train_one(words, tags)
random.shuffle(sentences)
self.end_training(save_loc)
def save(self):
# Pickle as a binary file
pickle.dump((self.model.weights, self.tagdict, self.classes),
open(PerceptronTagger.model_loc, 'wb'), -1)
def train_one(self, words, tags):
prev, prev2 = START
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
guess = self.tagdict.get(word)
if not guess:
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)
prev2 = prev; prev = guess
def load(self, loc):
w_td_c = pickle.load(open(loc, 'rb'))
self.model.weights, self.tagdict, self.classes = w_td_c
self.model.classes = self.classes
def _normalize(self, word):
if '-' in word and word[0] != '-':
return '!HYPHEN'
elif word.isdigit() and len(word) == 4:
return '!YEAR'
elif word[0].isdigit():
return '!DIGITS'
else:
return word.lower()
def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.'''
def add(name, *args):
features[' '.join((name,) + tuple(args))] += 1
i += len(START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
add('bias')
add('i suffix', word[-3:])
add('i pref1', word[0])
add('i-1 tag', prev)
add('i-2 tag', prev2)
add('i tag+i-2 tag', prev, prev2)
add('i word', context[i])
add('i-1 tag+i word', prev, context[i])
add('i-1 word', context[i-1])
add('i-1 suffix', context[i-1][-3:])
add('i-2 word', context[i-2])
add('i+1 word', context[i+1])
add('i+1 suffix', context[i+1][-3:])
add('i+2 word', context[i+2])
return features
def _make_tagdict(self, sentences):
'''Make a tag dictionary for single-tag words.'''
counts = defaultdict(lambda: defaultdict(int))
for sent in sentences:
for word, tag in zip(sent[0], sent[1]):
counts[word][tag] += 1
self.classes.add(tag)
freq_thresh = 20
ambiguity_thresh = 0.97
for word, tag_freqs in counts.items():
tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
n = sum(tag_freqs.values())
# Don't add rare words to the tag dictionary
# Only add quite unambiguous words
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
self.tagdict[word] = tag
def _pc(n, d):
return (float(n) / d) * 100
def train(parser, sentences, nr_iter):
parser.tagger.start_training(sentences)
for itn in range(nr_iter):
corr = 0; total = 0
random.shuffle(sentences)
for words, gold_tags, gold_parse, gold_label in sentences:
corr += parser.train_one(itn, words, gold_tags, gold_parse)
if itn < 5:
parser.tagger.train_one(words, gold_tags)
total += len(words)
print itn, '%.3f' % (float(corr) / float(total))
if itn == 4:
parser.tagger.model.average_weights()
print 'Averaging weights'
parser.model.average_weights()
def read_pos(loc):
for line in open(loc):
if not line.strip():
continue
words = DefaultList('')
tags = DefaultList('')
for token in line.split():
if not token:
continue
word, tag = token.rsplit('/', 1)
#words.append(normalize(word))
words.append(word)
tags.append(tag)
pad_tokens(words); pad_tokens(tags)
yield words, tags
def read_conll(loc):
for sent_str in open(loc).read().strip().split('\n\n'):
lines = [line.split() for line in sent_str.split('\n')]
words = DefaultList(''); tags = DefaultList('')
heads = [None]; labels = [None]
for i, (word, pos, head, label) in enumerate(lines):
words.append(intern(word))
#words.append(intern(normalize(word)))
tags.append(intern(pos))
heads.append(int(head) + 1 if head != '-1' else len(lines) + 1)
labels.append(label)
pad_tokens(words); pad_tokens(tags)
yield words, tags, heads, labels
def pad_tokens(tokens):
tokens.insert(0, '<start>')
tokens.append('ROOT')
def main(model_dir, train_loc, heldout_in, heldout_gold):
if not os.path.exists(model_dir):
os.mkdir(model_dir)
input_sents = list(read_pos(heldout_in))
parser = Parser(load=False)
sentences = list(read_conll(train_loc))
train(parser, sentences, nr_iter=15)
parser.save()
c = 0
t = 0
gold_sents = list(read_conll(heldout_gold))
t1 = time.time()
for (words, tags), (_, _, gold_heads, gold_labels) in zip(input_sents, gold_sents):
_, heads = parser.parse(words)
for i, w in list(enumerate(words))[1:-1]:
if gold_labels[i] in ('P', 'punct'):
continue
if heads[i] == gold_heads[i]:
c += 1
t += 1
t2 = time.time()
print 'Parsing took %0.3f ms' % ((t2-t1)*1000.0)
print c, t, float(c)/t
if __name__ == '__main__':
main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])

Hi, thanks for your code, I am quite interested in it. Could you please clarify what the 4 command line parameters should be ? Thank you.

Where can I get parser.py file (parsing english in 500 lines of python)?

In line 359, where is end_training() defined?

lluisp commented Sep 18, 2017

I liked you post a lot, it is very clear and enlightening.

I have a question, though: I tried to train it with conll-X data (after guessing the parameters, their formats, and fixing the files... some doc here would be nice) but it always fails at "assert gold_moves" in line 84. The reason is that since the perceptron starts with random weights, during the first epoch it is very likely to get into a nonsense path and reach a configuration were all moves will be discarded.
Any suggestion how to handle this? Maybe use a static oracle during the first epochs?

Hey @lluisp

(after guessing the parameters, their formats, and fixing the files

Can you share those parameters and the doc formats?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment