@JonathanRaiman
Last active August 29, 2015 14:13
Lattice Learning
import matplotlib.pyplot as plt
import theano, theano.tensor as T, numpy as np
import theano_lstm
import os
from collections import Counter
to_softmax = lambda x: T.nnet.softmax(x)[0] if x.ndim == 1 else T.nnet.softmax(x.T).T
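# `to_softmax` normalises the final layer's output whether it receives a single
# activation vector (ndim == 1) or a matrix; in the matrix case the double
# transpose makes the softmax run over each column instead of each row.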
class LatticeModel(object):
def __init__(self,
word_size,
vocabulary_size,
stack_size,
hidden_size,
max_branching_factor,
memory_sparsity = 0.0001,
rho = 0.95,
verbose=False,
theano_mode = "FAST_RUN"):
self.memory_sparsity= theano.shared(np.float64(memory_sparsity), name="memory_sparsity")
self.theano_mode = theano_mode
self.word_size = word_size
self.vocabulary_size = theano.shared(np.int32(vocabulary_size), name="vocabulary_size")
self.stack_size = stack_size
self.hidden_size = hidden_size
self.max_branching_factor = max_branching_factor
### CREATE THE CELLS:
model = theano_lstm.StackedCells(word_size, layers=[hidden_size] * stack_size, celltype=theano_lstm.LSTM, activation=T.tanh)
# add a softmax layer at the end (non-recurrent)
# special end token:
model.layers.append(theano_lstm.Layer(hidden_size, max_branching_factor + 2, to_softmax))
# add an embedding layer at the beginning (non-recurrent):
model.layers = [theano_lstm.Embedding(vocabulary_size + max_branching_factor + 2, word_size),
theano_lstm.GatedInput(word_size, hidden_size, T.nnet.sigmoid)] + model.layers
self.model = model
### CONSTRUCT THE PREDICTION / WIRING:
def step(word_id, *prev_hiddens):
if prev_hiddens[-1].ndim > 1:
top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
else:
top_level_activ = prev_hiddens[-1][self.hidden_size:]
new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
# return every layer's output except the embedding; the gate activations come back first so memory usage can be penalised
return new_state[1:]
def pred_step(word_id, *prev_hiddens):
if prev_hiddens[-1].ndim > 1:
top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
else:
top_level_activ = prev_hiddens[-1][self.hidden_size:]
new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
# feed the most probable symbol back in as the next input and carry the hidden states forward (dropping embedding, gates and the softmax output)
return [T.cast(new_state[-1].argmax() + self.vocabulary_size, dtype='int32')] + new_state[2:-1]
def predict_sequence(x, return_all=False, return_memory=False):
if x.ndim > 1:
outputs_info = [None] + [dict(initial=T.repeat(T.shape_padleft(layer.initial_hidden_state), x.shape[0], axis=0), taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
else:
outputs_info = [None] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
outputs_info = outputs_info + [None]
result, updates = theano.scan(step,
sequences = [x.T if x.ndim > 1 else x],
outputs_info = outputs_info)
if return_all:
return result
else:
res = result[-1].dimshuffle(1, 0, 2) if x.ndim > 1 else result[-1]
# gate values can be obtained by asking for them from the stacked cells
if return_memory:
return result[0], res
else:
return res
# every sequence is a series of indices
# for words:
input_sentences = T.imatrix()
# some sequences are shorter than others, so we'll note where they
# end in a zero-indexed fashion
sequence_lengths = T.ivector()
sequence_starts = T.ivector()
# the labels are integers in the range of the dictionary
batchsize = theano.shared(100, name="batchsize")
batch_index = T.iscalar()
self.batchsize = batchsize
self.input_sentences = input_sentences
self.sequence_lengths = sequence_lengths
self.sequence_starts = sequence_starts
memory_usage, self.predictions = predict_sequence(input_sentences, return_memory=True)
self.error = theano_lstm.masked_loss(
self.predictions,
input_sentences[:,1:] - self.vocabulary_size,
sequence_lengths,
sequence_starts).mean() + (memory_usage.sum() * self.memory_sparsity) / input_sentences.shape[0]
self.memory_fun = theano.function([input_sentences], memory_usage,
allow_input_downcast=True,
mode=self.theano_mode)
self.predict_fun = theano.function([input_sentences],
self.predictions,
allow_input_downcast=True,
mode=self.theano_mode)
self.error_fun = theano.function([input_sentences, sequence_lengths, sequence_starts],
self.error,
allow_input_downcast=True,
mode=self.theano_mode)
self.input_sentence = T.ivector()
prep_result = predict_sequence(self.input_sentence, return_all=True)
pred_outputs_info = [dict(initial=self.input_sentence[-1], taps=[-1])] + [dict(initial=prep_hidden[-1], taps=[-1]) for prep_hidden in prep_result[1:-1]]
prediction_steps = T.iscalar()
pred_result, _ = theano.scan(pred_step,
n_steps = prediction_steps,
outputs_info = pred_outputs_info)
self.reconstruct_fun = theano.function([self.input_sentence, prediction_steps],
pred_result[0],
allow_input_downcast=True,
mode=self.theano_mode)
self.input_labels = theano.function([input_sentences],
input_sentences[:,1:] - self.vocabulary_size,
mode=self.theano_mode)
if verbose:
print("created prediction & error functions")
updates, gsums, xsums, lr, max_norm = theano_lstm.create_optimization_updates(self.error, model.params, max_norm=None, rho=rho, method="adadelta")
self.lr = lr
if verbose:
print("took the gradient")
self.gsums = gsums
self.xsums = xsums
self.update_fun = theano.function([input_sentences, sequence_lengths, sequence_starts],
outputs=None,
updates=updates,
mode=self.theano_mode)
if verbose:
print("created the gradient descent function")
def reset_weights(self):
for param, xsum, gsum in zip(self.model.params, self.xsums, self.gsums):
xsum.set_value(np.zeros_like(param.get_value(borrow=True)))
gsum.set_value(np.zeros_like(param.get_value(borrow=True)))
param.set_value((np.random.standard_normal(param.get_value(borrow=True).shape) * 1. / param.get_value(borrow=True).shape[0]
).astype(param.dtype))
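# LatticeModel stacks, in order: an Embedding over words plus code symbols, a
# GatedInput layer (its gate activations are summed into `memory_usage` and
# kept small by the `memory_sparsity` penalty), `stack_size` LSTM layers, and
# a softmax over `max_branching_factor + 2` symbols (the branching choices
# plus an end-of-sequence and an end-of-prediction token).
# The compiled functions take the padded sentence matrix and the two length
# vectors handed to masked_loss; an illustrative call, mirroring the training
# loop further down:
#
#   lattice_model.update_fun(data_batch, codelens_batch, sequence_lengths_batch)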
lattice_model = LatticeModel(word_size=100,
vocabulary_size=400,
stack_size=3,
hidden_size=100,
max_branching_factor=8,
memory_sparsity=0.3,
verbose=True,
rho=0.95)
def read_examples(path, vocab_size):
vocab = Counter()
examples = []
with open(path, "rt") as f:
for line in f:
label, sentence = line.split(" ", 1)
words = sentence.strip().split(" ")
examples.append([words, label])
vocab.update(words)
index2word = []
word2index = {}
for k, (word, word_count) in enumerate(vocab.most_common(vocab_size-3)):
index2word.append(word)
word2index[word] = k
index2word.append("**UNKNOWN**")
word2index["**UNKNOWN**"] = len(word2index)
index2word.append("**END**")
word2index["**END**"] = len(word2index)
return word2index, index2word, examples
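# `read_examples` expects one example per line: a label, a single space, then
# a space-tokenised sentence, e.g. (hypothetical line)
#   tennis Federer won Wimbledon again
# `most_common(vocab_size - 3)` leaves room for the **UNKNOWN** and **END**
# markers appended below.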
import random
def example_to_numerical(example, word2index, codes, vocab_size):
code = list(np.array(random.sample(codes[example[1]], 1)[0]) + vocab_size)
codelen = len(code) + 2 # mark the end of a sequence, and the end of all predictions
end_sequence_symbol = vocab_size + lattice_model.max_branching_factor
end_prediction_symbol = vocab_size + lattice_model.max_branching_factor + 1
return (
[
word2index[word] if word in word2index else word2index["**UNKNOWN**"]
for word in example[0]
] + [word2index["**END**"]] + code + [end_sequence_symbol, end_prediction_symbol],
codelen
)
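# Each example becomes one index sequence:
#   [word ids ..., **END**, code digits + vocab_size ..., end_sequence, end_prediction]
# With the settings used in this gist (vocab_size=400, max_branching_factor=8),
# code digit 0 maps to 400, end_sequence to 408 and end_prediction to 409;
# `codelen` counts the code digits plus the two terminal symbols.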
def examples_to_numerical(examples, word2index, codes, vocab_size):
dataset = []
codelens = []
sequence_lengths = []
for example in examples:
sequence_lengths.append(len(example[0]))
ex, codelen = example_to_numerical(example, word2index, codes, vocab_size)
dataset.append(ex)
codelens.append(codelen)
sequence_lengths = np.array(sequence_lengths, dtype=np.int32)
codelens = np.array(codelens, dtype=np.int32)
max_len_example = max(map(len, dataset))
data = np.zeros([len(dataset), max_len_example], dtype=np.int32)
for k, example in enumerate(dataset):
data[k,:len(example)] = example
return data, codelens, sequence_lengths
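# Examples are right-padded with zeros into a single int32 matrix; `codelens`
# and `sequence_lengths` let masked_loss score only the code portion of each
# row and ignore both the sentence prefix and the padding.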
def multi_examples(data, examples, codelens, sequence_lengths, num):
multi_codelens = np.zeros([num], dtype=codelens.dtype)
multi_sequence_lengths = np.zeros([num], dtype=sequence_lengths.dtype)
picks = np.random.randint(0, len(data), size=(num, 2))
data_len = max( len(examples[pick[0]][0]) + len(examples[pick[1]][0]) for pick in picks ) + 1 + (codelens.max() * 2) -1
multi_dataset = np.zeros([num, data_len], dtype=data.dtype)
vocab_size = lattice_model.vocabulary_size.get_value()
end_prediction_symbol = vocab_size + lattice_model.max_branching_factor + 1
for k, pick in enumerate(picks):
multi_dataset[k, :sequence_lengths[pick[0]]] = data[pick[0], :sequence_lengths[pick[0]]]
multi_dataset[k, sequence_lengths[pick[0]] : sequence_lengths[pick[0]] + sequence_lengths[pick[1]]] = data[pick[1], :sequence_lengths[pick[1]]]
multi_codelens[k] = codelens[pick[0]] + codelens[pick[1]] - 2 + 1
multi_sequence_lengths[k] = sequence_lengths[pick[0]] + sequence_lengths[pick[1]]
multi_dataset[k, sequence_lengths[pick[0]] + sequence_lengths[pick[1]]] = data[pick[0], sequence_lengths[pick[0]]]
start_code = sequence_lengths[pick[0]] + sequence_lengths[pick[1]] + 1
end_code = start_code + multi_codelens[k]
repeat = np.all(
data[pick[0], sequence_lengths[pick[0]] + 1 : sequence_lengths[pick[0]] + 1 + codelens[pick[0]] - 1] ==
data[pick[1], sequence_lengths[pick[1]] + 1 : sequence_lengths[pick[1]] + 1 + codelens[pick[1]] - 1])
multi_dataset[k, start_code : start_code + codelens[pick[0]] -1] = data[pick[0], sequence_lengths[pick[0]] + 1: sequence_lengths[pick[0]] + 1 + codelens[pick[0]] -1]
if not repeat:
multi_dataset[k, start_code + codelens[pick[0]] -1 : end_code - 1] = data[pick[1], sequence_lengths[pick[1]] + 1: sequence_lengths[pick[1]] + 1 + codelens[pick[1]] -1]
multi_dataset[k, end_code -1] = end_prediction_symbol
else:
multi_codelens[k] = codelens[pick[0]]
multi_dataset[k, start_code + codelens[pick[0]] -1 : start_code + codelens[pick[0]]] = end_prediction_symbol
return multi_dataset, multi_codelens, multi_sequence_lengths
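# `multi_examples` stitches two randomly picked examples into one training row:
# both sentences are concatenated, followed by an **END** marker, the first
# example's code, the second example's code (skipped when the two codes are
# identical), and the end-of-prediction symbol, so the model also sees inputs
# that call for more than one lattice path.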
def reconstruct_visual(example, sequence_length, codelen, example_text, path_decoder):
decoded = lattice_model.reconstruct_fun(example[:sequence_length+1], 20) - lattice_model.vocabulary_size.get_value()
codes = []
code = []
end_sequence_symbol = lattice_model.max_branching_factor
end_prediction_symbol = lattice_model.max_branching_factor + 1
for i in decoded:
if i == end_prediction_symbol:
break
if i == end_sequence_symbol:
codes.append(tuple(code))
code = []
continue
code.append(i)
codenames = [path_decoder[code] if code in path_decoder else "??" for code in codes]
print("%s => %r" % (" ".join(example_text[0]), codenames))
paths = {
"nil": [[]],
"sport": [[0]],
"racket_sport":[[0,0]],
"individual_sport":[[0,1]],
"team_sport": [[0,2]],
"ball_sport": [[0,3]],
"watersport": [[0,4]],
"tennis": [[0,0,0], [0,1,0], [0,3, 0]],
"ping_pong": [[0,0, 1], [0,1,1], [0,3,1]],
"badminton": [[0,0, 2], [0,1, 2], [0,3,2]],
"football": [[0,2, 0], [0,3, 3]],
"baseball": [[0,2, 1], [0,3, 4]],
"rugby": [[0,2, 2], [0,3, 5]],
"polo": [[0,2, 3], [0,3, 6]],
"waterpolo": [[0,4, 0], [0,3, 7]],
"animal_kingdom" : [[1]],
"mammals": [[1, 0]],
"reptiles": [[1, 1]],
"frog": [[1, 1, 0]],
"alligator": [[1, 1, 1]],
"whale": [[1, 0, 0]]
}
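# Each label maps to one or more paths through a small hand-written ontology:
# a code such as [0, 3, 1] reads sport -> ball_sport -> ping_pong. Labels
# reachable through several parents (tennis, football, waterpolo, ...) list
# every admissible path, and example_to_numerical samples one of them at
# random for each training example.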
code2path = {tuple(code):path for path in paths for code in paths[path]}
word2index, index2word, examples = read_examples("ontology_examples.txt",
400)
data, codelens, sequence_lengths = examples_to_numerical(
examples,
word2index,
paths,
400,
)
multi_data, multi_codelens, multi_sequence_lengths = multi_examples(data, examples,codelens, sequence_lengths, 200)
def to_text(indices):
words = []
vocab_size = lattice_model.vocabulary_size.get_value()
end_seq = vocab_size + lattice_model.max_branching_factor
end_pred = end_seq + 1
for i in indices:
if i == end_pred:
words.append("**END_PRED**")
break
if i == end_seq:
words.append("**LEAF**")
continue
if i >= len(index2word):
words.append(str(i - vocab_size))
continue
words.append(index2word[i])
return " ".join(words)
epochs = 160
errors = []
batchsize = 20
num_batches_a = int(len(data) / batchsize) + 1
num_batches_b = int(len(multi_data) / batchsize) + 1
try:
for epoch in range(epochs):
example_order_a = np.arange(0, len(data))
np.random.shuffle(example_order_a)
for batch in range(num_batches_a):
lattice_model.update_fun(
data[example_order_a][batch * batchsize:(batch+1) * batchsize],
codelens[example_order_a][batch * batchsize:(batch+1) * batchsize],
sequence_lengths[example_order_a][batch * batchsize:(batch+1) * batchsize])
example_order_b = np.arange(0, len(multi_data))
np.random.shuffle(example_order_b)
for batch in range(num_batches_b):
lattice_model.update_fun(
multi_data[example_order_b][batch * batchsize:(batch+1) * batchsize],
multi_codelens[example_order_b][batch * batchsize:(batch+1) * batchsize],
multi_sequence_lengths[example_order_b][batch * batchsize:(batch+1) * batchsize])
if epoch > 0 and epoch % 2 == 0:
errors.append(len(examples) * lattice_model.error_fun(data, codelens, sequence_lengths))
print("epoch %d, error %.3f" % (epoch, errors[-1]))
print("\n")
print("example decoding:\n")
ex_num = np.random.randint(0, len(examples))
reconstruct_visual(data[ex_num],
sequence_lengths[ex_num],
codelens[ex_num],
examples[ex_num],
code2path)
ex_num = np.random.randint(0, len(examples))
reconstruct_visual(data[ex_num],
sequence_lengths[ex_num],
codelens[ex_num],
examples[ex_num],
code2path)
print("\n")
except KeyboardInterrupt:
print("done")
def encode_into_indices(sentence):
seq = []
for word in sentence:
if word in word2index:
seq.append( word2index[word] )
else:
seq.append( word2index["**UNKNOWN**"])
seq.append( word2index["**END**"] )
return seq
def decode_from_indices(from_network):
decoded = []
code = []
for i in from_network:
if i == lattice_model.max_branching_factor + 1:
break
if i == lattice_model.max_branching_factor:
decoded.append(tuple(code))
code= []
continue
code.append(i)
codenames = [code2path[code] if code in code2path else "??" for code in decoded]
return " ".join(codenames)
def beam_search(sentence, n, max_steps = 10):
indices = encode_into_indices(sentence.split() if type(sentence) is str else sentence)
vocab_size = lattice_model.vocabulary_size.get_value()
end_seq = vocab_size + lattice_model.max_branching_factor
end_pred = end_seq + 1
# we start off with n different options:
open_list = []
top_outs, top_probs = beam_search_with_indices(indices, n, prob=1.)
for candidate, prob in zip(top_outs, top_probs):
open_list.append((indices + [candidate + vocab_size], prob))
# for each option we expand another n options forward:
i = 0
while True:
stops = 0
options = [op for op in open_list]
open_list = []
for candidate, prob in options:
if candidate[-1] == end_pred:
stops += 1
open_list.append((candidate, prob))
continue
else:
# this candidate has not emitted the end-of-prediction token yet: expand it
new_candidates, new_probs = beam_search_with_indices(candidate, n, prob=prob)
for new_candidate, new_prob in zip(new_candidates, new_probs):
open_list.append((candidate + [new_candidate + vocab_size], new_prob))
open_list.sort(key=lambda x: -x[1])
open_list = open_list[:n]
i += 1
if i == max_steps:
break
if stops == n:
break
open_list = [(decode_from_indices(np.array(code[len(indices):-1]) - vocab_size), prob) for code, prob in open_list]
return open_list
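# Beam search sketch: the sentence is encoded once, the n most probable first
# code symbols seed the beam, and at every step each unfinished candidate is
# expanded with its n best continuations before the list is pruned back to the
# n highest-probability candidates. The loop stops when all n candidates have
# emitted the end-of-prediction token or after max_steps expansions, and the
# surviving codes are decoded back into ontology labels.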
def beam_search_with_indices(indices, size, prob = 1.):
probs = lattice_model.predict_fun([indices])[0,-1]
top_outputs = probs.argsort()[::-1][:size]
top_probs = probs[top_outputs] * prob
return top_outputs, top_probs
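# Candidate scores are plain products of per-step softmax probabilities (prob
# is multiplied in at every call), so the ranking implicitly favours shorter
# codes; there is no length normalisation.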
print(beam_search("The Komodo dragon can live in all sorts of places . Tennis is a beautiful sport . McEnroe plays really well", 5))
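ontology_examples.txt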
nil . . .
nil is .
nil from .
nil of
nil the
nil stuff
waterpolo Waterpolo is a great sport .
ping_pong Ping Pong is a great sport .
tennis Tennis is a great sport .
football Football is a great sport .
baseball Baseball is a great sport .
polo Polo is a great sport .
ping_pong Japan has won the Ping Pong olympics three years in a row
waterpolo Water Polo is a great water sport
tennis Raphael Nadal has won the grand schlem many times
tennis Federer and Nadal will be playing in the finals
ping_pong Ping Pong tournament in Paris
tennis Rollang Garros tournament in Paris
tennis Wimbledon won by Federer
football Zidane and Ronaldo are great team players
baseball the Oackland As won the game
baseball great home run by the Giants
baseball MLB 's final taking place next week
baseball MLB best player of the year
baseball Oakland As won the MLB
tennis Austrian Open won by Federer
baseball Oakland As used statistics to gain an edge
baseball Babe Ruth is an american favorite
tennis when McEnroe plays there is a lot more than brute force
football Zlatan is not a very talkative football player
waterpolo Water Polo is a team water sport
waterpolo water polo is a water sport
waterpolo Williams Wilson invented the first rules for water polo
racket_sport badminton and tennis are played using rackets
waterpolo Water Polo is a great water sport .
tennis Raphael Nadal has won the grand schlem many times .
tennis Federer and Nadal will be playing in the finals .
ping_pong Ping Pong tournament in Paris .
tennis Rollang Garros tournament in Paris .
tennis Wimbledon won by Federer .
football Zidane and Ronaldo are great team players .
baseball the Oackland As won the game .
baseball great home run by the Giants .
baseball MLB 's final taking place next week .
baseball MLB best player of the year .
baseball Oakland As won the MLB .
tennis Austrian Open won by Federer .
baseball Oakland As used statistics to gain an edge .
baseball Babe Ruth is an american favorite .
tennis when McEnroe plays there is a lot more than brute force .
football Zlatan is not a very talkative football player .
waterpolo Water Polo is a team water sport .
waterpolo water polo is a water sport .
waterpolo Williams Wilson invented the first rules for water polo .
racket_sport badminton and tennis are played using rackets .
rugby 14 teams qualify for the 2013 Rugby League World Cup : Australia , England , New Zealand , Samoa , Wales , Fiji , France , Papua New Guinea , Ireland , Scotland , Tonga , Cook Islands , Italy and United States of America .
rugby New Zealand defeat France 8–7 at Eden Park , Auckland , in the seventh rugby union World Cup , held in New Zealand .
rugby in 1969 Rugby league finally gains recognition as a sport in British universities and colleges .
football A player takes a free kick , while the opposition form a " wall " to try to block the ball
football In the late 1990s and early 2000s , the IFAB experimented with ways of creating a winner without requiring a penalty shootout , which was often seen as an undesirable way to end a match .
football Association football has been played by women since at least the time of the first recorded women 's games in the late 19th century
football Today , football is played at a professional level all over the world . Millions of people regularly go to football stadiums to follow their favourite teams , while billions more watch the game on television or on the internet
tennis Roger Federer is now considered by many observers to have the most " complete " game in modern tennis . He has won 17 grand slam titles and 6 world tour finals , the most for any male player .
tennis McEnroe exacted revenge two months later , beating Borg in the five-set final of the 1980 US Open .
polo The recent surge of excitement in south-east Asia around the game has resulted in its popularity in cities such as Pattaya , Kuala Lumpur and Jakarta .
polo The mounts used are called ' polo ponies ' , although the term pony is purely traditional and the mount is actually a full-sized horse
polo James Gordon Bennett , Jr. on 6 May 1876 organized what was billed as the first polo match in the United States at Dickel's Riding Academy at 39th Street and Fifth Avenue in New York City .
whale The bareback whale can stay under water for several hours
mammals Lions are a pack animal
reptiles Lizards can look in one direction while using their tongue in another
reptiles most frogs can jump several times their body length
reptiles it is currently believed that dinosaurs are the ancestors of chicken
mammals cow milk has only recently been collected by humans
reptiles the Komodo dragon is not only poisonous but also very fast
tennis in tennis reflexes are critical
rugby most rugby players are not as muscular as football players
baseball Baseball grew in popularity during the second world war in the United States
polo In Argentina polo is very popular unlike in most other southern American countries
mammals Dogs and cats are mammals
mammals Dog is man 's best friend
mammals Cats do not usually get along with dogs
whale All whales, dolphins and porpoises are descendants of land-living mammals , most likely of the Artiodactyl order . They entered the water roughly 50 million years ago .
whale The baleen whales are characterized by baleen , a sieve-like structure in the upper jaw made of keratin , which they use to filter plankton from the water . They are the largest species of whale .
whale Like all mammals , whales breathe air into lungs , are warm-blooded , feed their young milk from mammary glands , and have some (although very little ) hair .
whale A young scientist , Eric Alexander Ivanov , in 1911 , was the first to discover that the whale 's ancestors lived on land , and that whales have adapted to a fully aquatic life .
alligator Alligators are characterized by a broader snout and eyes more dorsally located than their crocodile cousins .
alligator According to the Everglades National Park website , the largest alligator ever recorded in Florida was 17 feet 5 inches long ( 5.3 meters ) .
alligator There are only two countries on earth that have alligators : the United States and China .
alligator Although alligators have heavy bodies and slow metabolisms , they are capable of short bursts of speed that can exceed 30 miles per hour though this could more properly be classified as a short fast lunge rather than a dash . Alligators ' main prey are smaller animals that they can kill and eat with a single bite .
alligator Alligators may kill larger prey by grabbing it and dragging it in the water to drown .
alligator Alligators consume food that cannot be eaten in one bite by allowing it to rot or by biting and then spinning or convulsing wildly until bite size pieces are torn off . This is referred to as the " death roll . "
frog Adult frogs are characterised by long hind legs , a short body , webbed digits , protruding eyes and the absence of a tail .
frog Most frogs have a semi-aquatic lifestyle , but move easily on land by jumping or climbing .
whale The baleen whales are characterized by baleen , a sieve-like structure in the upper jaw made of keratin , which they use to filter plankton from the water . They are the largest species of whale
whale Like all mammals , whales breathe air into lungs , are warm-blooded , feed their young milk from mammary glands , and have some (although very little ) hair
whale A young scientist , Eric Alexander Ivanov , in 1911 , was the first to discover that the whale 's ancestors lived on land , and that whales have adapted to a fully aquatic life
alligator Alligators are characterized by a broader snout and eyes more dorsally located than their crocodile cousins
alligator According to the Everglades National Park website , the largest alligator ever recorded in Florida was 17 feet 5 inches long ( 5.3 meters )
alligator There are only two countries on earth that have alligators : the United States and China
alligator Although alligators have heavy bodies and slow metabolisms , they are capable of short bursts of speed that can exceed 30 miles per hour though this could more properly be classified as a short fast lunge rather than a dash . Alligators ' main prey are smaller animals that they can kill and eat with a single bite
alligator Alligators may kill larger prey by grabbing it and dragging it in the water to drown
alligator Alligators consume food that cannot be eaten in one bite by allowing it to rot or by biting and then spinning or convulsing wildly until bite size pieces are torn off . This is referred to as the " death roll .
frog Adult frogs are characterised by long hind legs , a short body , webbed digits , protruding eyes and the absence of a tail
frog Most frogs have a semi-aquatic lifestyle , but move easily on land by jumping or climbing