@JonathanRaiman
Last active August 29, 2015 14:13
Lattice Learning
import matplotlib.pyplot as plt
import theano, theano.tensor as T, numpy as np
import theano_lstm
import os
from collections import Counter
to_softmax = lambda x: T.nnet.softmax(x)[0] if x.ndim == 1 else T.nnet.softmax(x.T).T
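# `to_softmax` normalises the final layer's output whether it receives a single
# activation vector (ndim == 1) or a matrix; in the matrix case the double
# transpose makes the softmax run over each column instead of each row.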
class LatticeModel(object):
def __init__(self,
word_size,
vocabulary_size,
stack_size,
hidden_size,
max_branching_factor,
memory_sparsity = 0.0001,
rho = 0.95,
verbose=False,
theano_mode = "FAST_RUN"):
self.memory_sparsity= theano.shared(np.float64(memory_sparsity), name="memory_sparsity")
self.theano_mode = theano_mode
self.word_size = word_size
self.vocabulary_size = theano.shared(np.int32(vocabulary_size), name="vocabulary_size")
self.stack_size = stack_size
self.hidden_size = hidden_size
self.max_branching_factor = max_branching_factor
### CREATE THE CELLS:
model = theano_lstm.StackedCells(word_size, layers=[hidden_size] * stack_size, celltype=theano_lstm.LSTM, activation=T.tanh)
# add a softmax layer at the end (non-recurrent)
# special end token:
model.layers.append(theano_lstm.Layer(hidden_size, max_branching_factor + 2, to_softmax))
# add an embedding layer at the beginning (non-recurrent):
model.layers = [theano_lstm.Embedding(vocabulary_size + max_branching_factor + 2, word_size),
theano_lstm.GatedInput(word_size, hidden_size, T.nnet.sigmoid)] + model.layers
self.model = model
### CONSTRUCT THE PREDICTION / WIRING:
def step(word_id, *prev_hiddens):
if prev_hiddens[-1].ndim > 1:
top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
else:
top_level_activ = prev_hiddens[-1][self.hidden_size:]
new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
# return every layer's output except the embedding; the gate activations come back first so memory usage can be penalised
return new_state[1:]
def pred_step(word_id, *prev_hiddens):
if prev_hiddens[-1].ndim > 1:
top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
else:
top_level_activ = prev_hiddens[-1][self.hidden_size:]
new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
# feed the most probable symbol back in as the next input and carry the hidden states forward (dropping embedding, gates and the softmax output)
return [T.cast(new_state[-1].argmax() + self.vocabulary_size, dtype='int32')] + new_state[2:-1]
def predict_sequence(x, return_all=False, return_memory=False):
if x.ndim > 1:
outputs_info = [None] + [dict(initial=T.repeat(T.shape_padleft(layer.initial_hidden_state), x.shape[0], axis=0), taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
else:
outputs_info = [None] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
outputs_info = outputs_info + [None]
result, updates = theano.scan(step,
sequences = [x.T if x.ndim > 1 else x],
outputs_info = outputs_info)
if return_all:
return result
else:
res = result[-1].dimshuffle(1, 0, 2) if x.ndim > 1 else result[-1]
# gate values can be obtained by asking for them from the stacked cells
if return_memory:
return result[0], res
else:
return res
# every sequence is a series of indices
# for words:
input_sentences = T.imatrix()
# some sequences are shorter than others, so we'll note where they
# end in a zero-indexed fashion
sequence_lengths = T.ivector()
sequence_starts = T.ivector()
# the labels are integers in the range of the dictionary
batchsize = theano.shared(100, name="batchsize")
batch_index = T.iscalar()
self.batchsize = batchsize
self.input_sentences = input_sentences
self.sequence_lengths = sequence_lengths
self.sequence_starts = sequence_starts
memory_usage, self.predictions = predict_sequence(input_sentences, return_memory=True)
self.error = theano_lstm.masked_loss(
self.predictions,
input_sentences[:,1:] - self.vocabulary_size,
sequence_lengths,
sequence_starts).mean() + (memory_usage.sum() * self.memory_sparsity) / input_sentences.shape[0]
self.memory_fun = theano.function([input_sentences], memory_usage,
allow_input_downcast=True,
mode=self.theano_mode)
self.predict_fun = theano.function([input_sentences],
self.predictions,
allow_input_downcast=True,
mode=self.theano_mode)
self.error_fun = theano.function([input_sentences, sequence_lengths, sequence_starts],
self.error,
allow_input_downcast=True,
mode=self.theano_mode)
self.input_sentence = T.ivector()
prep_result = predict_sequence(self.input_sentence, return_all=True)
pred_outputs_info = [dict(initial=self.input_sentence[-1], taps=[-1])] + [dict(initial=prep_hidden[-1], taps=[-1]) for prep_hidden in prep_result[1:-1]]
prediction_steps = T.iscalar()
pred_result, _ = theano.scan(pred_step,
n_steps = prediction_steps,
outputs_info = pred_outputs_info)
self.reconstruct_fun = theano.function([self.input_sentence, prediction_steps],
pred_result[0],
allow_input_downcast=True,
mode=self.theano_mode)
self.input_labels = theano.function([input_sentences],
input_sentences[:,1:] - self.vocabulary_size,
mode=self.theano_mode)
if verbose:
print("created prediction & error functions")
updates, gsums, xsums, lr, max_norm = theano_lstm.create_optimization_updates(self.error, model.params, max_norm=None, rho=rho, method="adadelta")
self.lr = lr
if verbose:
print("took the gradient")
self.gsums = gsums
self.xsums = xsums
self.update_fun = theano.function([input_sentences, sequence_lengths, sequence_starts],
outputs=None,
updates=updates,
mode=self.theano_mode)
if verbose:
print("created the gradient descent function")
def reset_weights(self):
for param, xsum, gsum in zip(self.model.params, self.xsums, self.gsums):
xsum.set_value(np.zeros_like(param.get_value(borrow=True)))
gsum.set_value(np.zeros_like(param.get_value(borrow=True)))
param.set_value((np.random.standard_normal(param.get_value(borrow=True).shape) * 1. / param.get_value(borrow=True).shape[0]
).astype(param.dtype))
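# LatticeModel stacks, in order: an Embedding over words plus code symbols, a
# GatedInput layer (its gate activations are summed into `memory_usage` and
# kept small by the `memory_sparsity` penalty), `stack_size` LSTM layers, and
# a softmax over `max_branching_factor + 2` symbols (the branching choices
# plus an end-of-sequence and an end-of-prediction token).
# The compiled functions take the padded sentence matrix and the two length
# vectors handed to masked_loss; an illustrative call, mirroring the training
# loop further down:
#
#   lattice_model.update_fun(data_batch, codelens_batch, sequence_lengths_batch)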
lattice_model = LatticeModel(word_size=100,
vocabulary_size=400,
stack_size=3,
hidden_size=100,
max_branching_factor=8,
memory_sparsity=0.3,
verbose=True,
rho=0.95)
def read_examples(path, vocab_size):
vocab = Counter()
examples = []
with open(path, "rt") as f:
for line in f:
label, sentence = line.split(" ", 1)
words = sentence.strip().split(" ")
examples.append([words, label])
vocab.update(words)
index2word = []
word2index = {}
for k, (word, word_count) in enumerate(vocab.most_common(vocab_size-3)):
index2word.append(word)
word2index[word] = k
index2word.append("**UNKNOWN**")
word2index["**UNKNOWN**"] = len(word2index)
index2word.append("**END**")
word2index["**END**"] = len(word2index)
return word2index, index2word, examples
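# `read_examples` expects one example per line: a label, a single space, then
# a space-tokenised sentence, e.g. (hypothetical line)
#   tennis Federer won Wimbledon again
# `most_common(vocab_size - 3)` leaves room for the **UNKNOWN** and **END**
# markers appended below.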
import random
def example_to_numerical(example, word2index, codes, vocab_size):
code = list(np.array(random.sample(codes[example[1]], 1)[0]) + vocab_size)
codelen = len(code) + 2 # mark the end of a sequence, and the end of all predictions
end_sequence_symbol = vocab_size + lattice_model.max_branching_factor
end_prediction_symbol = vocab_size + lattice_model.max_branching_factor + 1
return (
[
word2index[word] if word in word2index else word2index["**UNKNOWN**"]
for word in example[0]
] + [word2index["**END**"]] + code + [end_sequence_symbol, end_prediction_symbol],
codelen
)
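# Each example becomes one index sequence:
#   [word ids ..., **END**, code digits + vocab_size ..., end_sequence, end_prediction]
# With the settings used in this gist (vocab_size=400, max_branching_factor=8),
# code digit 0 maps to 400, end_sequence to 408 and end_prediction to 409;
# `codelen` counts the code digits plus the two terminal symbols.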
def examples_to_numerical(examples, word2index, codes, vocab_size):
dataset = []
codelens = []
sequence_lengths = []
for example in examples:
sequence_lengths.append(len(example[0]))
ex, codelen = example_to_numerical(example, word2index, codes, vocab_size)
dataset.append(ex)
codelens.append(codelen)
sequence_lengths = np.array(sequence_lengths, dtype=np.int32)
codelens = np.array(codelens, dtype=np.int32)
max_len_example = max(map(len, dataset))
data = np.zeros([len(dataset), max_len_example], dtype=np.int32)
for k, example in enumerate(dataset):
data[k,:len(example)] = example
return data, codelens, sequence_lengths
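# Examples are right-padded with zeros into a single int32 matrix; `codelens`
# and `sequence_lengths` let masked_loss score only the code portion of each
# row and ignore both the sentence prefix and the padding.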
def multi_examples(data, examples, codelens, sequence_lengths, num):
multi_codelens = np.zeros([num], dtype=codelens.dtype)
multi_sequence_lengths = np.zeros([num], dtype=sequence_lengths.dtype)
picks = np.random.randint(0, len(data), size=(num, 2))
data_len = max( len(examples[pick[0]][0]) + len(examples[pick[1]][0]) for pick in picks ) + 1 + (codelens.max() * 2) -1
multi_dataset = np.zeros([num, data_len], dtype=data.dtype)
vocab_size = lattice_model.vocabulary_size.get_value()
end_prediction_symbol = vocab_size + lattice_model.max_branching_factor + 1
for k, pick in enumerate(picks):
multi_dataset[k, :sequence_lengths[pick[0]]] = data[pick[0], :sequence_lengths[pick[0]]]
multi_dataset[k, sequence_lengths[pick[0]] : sequence_lengths[pick[0]] + sequence_lengths[pick[1]]] = data[pick[1], :sequence_lengths[pick[1]]]
multi_codelens[k] = codelens[pick[0]] + codelens[pick[1]] - 2 + 1
multi_sequence_lengths[k] = sequence_lengths[pick[0]] + sequence_lengths[pick[1]]
multi_dataset[k, sequence_lengths[pick[0]] + sequence_lengths[pick[1]]] = data[pick[0], sequence_lengths[pick[0]]]
start_code = sequence_lengths[pick[0]] + sequence_lengths[pick[1]] + 1
end_code = start_code + multi_codelens[k]
repeat = np.all(
data[pick[0], sequence_lengths[pick[0]] + 1 : sequence_lengths[pick[0]] + 1 + codelens[pick[0]] - 1] ==
data[pick[1], sequence_lengths[pick[1]] + 1 : sequence_lengths[pick[1]] + 1 + codelens[pick[1]] - 1])
multi_dataset[k, start_code : start_code + codelens[pick[0]] -1] = data[pick[0], sequence_lengths[pick[0]] + 1: sequence_lengths[pick[0]] + 1 + codelens[pick[0]] -1]
if not repeat:
multi_dataset[k, start_code + codelens[pick[0]] -1 : end_code - 1] = data[pick[1], sequence_lengths[pick[1]] + 1: sequence_lengths[pick[1]] + 1 + codelens[pick[1]] -1]
multi_dataset[k, end_code -1] = end_prediction_symbol
else:
multi_codelens[k] = codelens[pick[0]]
multi_dataset[k, start_code + codelens[pick[0]] -1 : start_code + codelens[pick[0]]] = end_prediction_symbol
return multi_dataset, multi_codelens, multi_sequence_lengths
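# `multi_examples` stitches two randomly picked examples into one training row:
# both sentences are concatenated, followed by an **END** marker, the first
# example's code, the second example's code (skipped when the two codes are
# identical), and the end-of-prediction symbol, so the model also sees inputs
# that call for more than one lattice path.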
def reconstruct_visual(example, sequence_length, codelen, example_text, path_decoder):
decoded = lattice_model.reconstruct_fun(example[:sequence_length+1], 20) - lattice_model.vocabulary_size.get_value()
codes = []
code = []
end_sequence_symbol = lattice_model.max_branching_factor
end_prediction_symbol = lattice_model.max_branching_factor + 1
for i in decoded:
if i == end_prediction_symbol:
break
if i == end_sequence_symbol:
codes.append(tuple(code))
code = []
continue
code.append(i)
codenames = [path_decoder[code] if code in path_decoder else "??" for code in codes]
print("%s => %r" % (" ".join(example_text[0]), codenames))
paths = {
"nil": [[]],
"sport": [[0]],
"racket_sport":[[0,0]],
"individual_sport":[[0,1]],
"team_sport": [[0,2]],
"ball_sport": [[0,3]],
"watersport": [[0,4]],
"tennis": [[0,0,0], [0,1,0], [0,3, 0]],
"ping_pong": [[0,0, 1], [0,1,1], [0,3,1]],
"badminton": [[0,0, 2], [0,1, 2], [0,3,2]],
"football": [[0,2, 0], [0,3, 3]],
"baseball": [[0,2, 1], [0,3, 4]],
"rugby": [[0,2, 2], [0,3, 5]],
"polo": [[0,2, 3], [0,3, 6]],
"waterpolo": [[0,4, 0], [0,3, 7]],
"animal_kingdom" : [[1]],
"mammals": [[1, 0]],
"reptiles": [[1, 1]],
"frog": [[1, 1, 0]],
"alligator": [[1, 1, 1]],
"whale": [[1, 0, 0]]
}
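# Each label maps to one or more paths through a small hand-written ontology:
# a code such as [0, 3, 1] reads sport -> ball_sport -> ping_pong. Labels
# reachable through several parents (tennis, football, waterpolo, ...) list
# every admissible path, and example_to_numerical samples one of them at
# random for each training example.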
code2path = {tuple(code):path for path in paths for code in paths[path]}
word2index, index2word, examples = read_examples("ontology_examples.txt",
400)
data, codelens, sequence_lengths = examples_to_numerical(
examples,
word2index,
paths,
400,
)
multi_data, multi_codelens, multi_sequence_lengths = multi_examples(data, examples,codelens, sequence_lengths, 200)
def to_text(indices):
words = []
vocab_size = lattice_model.vocabulary_size.get_value()
end_seq = vocab_size + lattice_model.max_branching_factor
end_pred = end_seq + 1
for i in indices:
if i == end_pred:
words.append("**END_PRED**")
break
if i == end_seq:
words.append("**LEAF**")
continue
if i >= len(index2word):
words.append(str(i - vocab_size))
continue
words.append(index2word[i])
return " ".join(words)
epochs = 160
errors = []
batchsize = 20
num_batches_a = int(len(data) / batchsize) + 1
num_batches_b = int(len(multi_data) / batchsize) + 1
try:
for epoch in range(epochs):
example_order_a = np.arange(0, len(data))
np.random.shuffle(example_order_a)
for batch in range(num_batches_a):
lattice_model.update_fun(
data[example_order_a][batch * batchsize:(batch+1) * batchsize],
codelens[example_order_a][batch * batchsize:(batch+1) * batchsize],
sequence_lengths[example_order_a][batch * batchsize:(batch+1) * batchsize])
example_order_b = np.arange(0, len(multi_data))
np.random.shuffle(example_order_b)
for batch in range(num_batches_b):
lattice_model.update_fun(
multi_data[example_order_b][batch * batchsize:(batch+1) * batchsize],
multi_codelens[example_order_b][batch * batchsize:(batch+1) * batchsize],
multi_sequence_lengths[example_order_b][batch * batchsize:(batch+1) * batchsize])
if epoch > 0 and epoch % 2 == 0:
errors.append(len(examples) * lattice_model.error_fun(data, codelens, sequence_lengths))
print("epoch %d, error %.3f" % (epoch, errors[-1]))
print("\n")
print("example decoding:\n")
ex_num = np.random.randint(0, len(examples))
reconstruct_visual(data[ex_num],
sequence_lengths[ex_num],
codelens[ex_num],
examples[ex_num],
code2path)
ex_num = np.random.randint(0, len(examples))
reconstruct_visual(data[ex_num],
sequence_lengths[ex_num],
codelens[ex_num],
examples[ex_num],
code2path)
print("\n")
except KeyboardInterrupt:
print("done")
def encode_into_indices(sentence):
seq = []
for word in sentence:
if word in word2index:
seq.append( word2index[word] )
else:
seq.append( word2index["**UNKNOWN**"])
seq.append( word2index["**END**"] )
return seq
def decode_from_indices(from_network):
decoded = []
code = []
for i in from_network:
if i == lattice_model.max_branching_factor + 1:
break
if i == lattice_model.max_branching_factor:
decoded.append(tuple(code))
code= []
continue
code.append(i)
codenames = [code2path[code] if code in code2path else "??" for code in decoded]
return " ".join(codenames)
def beam_search(sentence, n, max_steps = 10):
indices = encode_into_indices(sentence.split() if type(sentence) is str else sentence)
vocab_size = lattice_model.vocabulary_size.get_value()
end_seq = vocab_size + lattice_model.max_branching_factor
end_pred = end_seq + 1
# we start off with n different options:
open_list = []
top_outs, top_probs = beam_search_with_indices(indices, n, prob=1.)
for candidate, prob in zip(top_outs, top_probs):
open_list.append((indices + [candidate + vocab_size], prob))
# for each option we expand another n options forward:
i = 0
while True:
stops = 0
options = [op for op in open_list]
open_list = []
for candidate, prob in options:
if candidate[-1] == end_pred:
stops += 1
open_list.append((candidate, prob))
continue
else:
# this candidate has not emitted the end-of-prediction token yet: expand it
new_candidates, new_probs = beam_search_with_indices(candidate, n, prob=prob)
for new_candidate, new_prob in zip(new_candidates, new_probs):
open_list.append((candidate + [new_candidate + vocab_size], new_prob))
open_list.sort(key=lambda x: -x[1])
open_list = open_list[:n]
i += 1
if i == max_steps:
break
if stops == n:
break
open_list = [(decode_from_indices(np.array(code[len(indices):-1]) - vocab_size), prob) for code, prob in open_list]
return open_list
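# Beam search sketch: the sentence is encoded once, the n most probable first
# code symbols seed the beam, and at every step each unfinished candidate is
# expanded with its n best continuations before the list is pruned back to the
# n highest-probability candidates. The loop stops when all n candidates have
# emitted the end-of-prediction token or after max_steps expansions, and the
# surviving codes are decoded back into ontology labels.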
def beam_search_with_indices(indices, size, prob = 1.):
probs = lattice_model.predict_fun([indices])[0,-1]
top_outputs = probs.argsort()[::-1][:size]
top_probs = probs[top_outputs] * prob
return top_outputs, top_probs
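# Candidate scores are plain products of per-step softmax probabilities (prob
# is multiplied in at every call), so the ranking implicitly favours shorter
# codes; there is no length normalisation.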
print(beam_search("The Komodo dragon can live in all sorts of places . Tennis is a beautiful sport . McEnroe plays really well", 5))
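ontology_examples.txt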
nil . . .
nil is .
nil from .
nil of
nil the
nil stuff
waterpolo Waterpolo is a great sport .
ping_pong Ping Pong is a great sport .
tennis Tennis is a great sport .
football Football is a great sport .
baseball Baseball is a great sport .
polo Polo is a great sport .
ping_pong Japan has won the Ping Pong olympics three years in a row
waterpolo Water Polo is a great water sport
tennis Raphael Nadal has won the grand schlem many times
tennis Federer and Nadal will be playing in the finals
ping_pong Ping Pong tournament in Paris
tennis Rollang Garros tournament in Paris
tennis Wimbledon won by Federer
football Zidane and Ronaldo are great team players
baseball the Oackland As won the game
baseball great home run by the Giants
baseball MLB 's final taking place next week
baseball MLB best player of the year
baseball Oakland As won the MLB
tennis Austrian Open won by Federer
baseball Oakland As used statistics to gain an edge
baseball Babe Ruth is an american favorite
tennis when McEnroe plays there is a lot more than brute force
football Zlatan is not a very talkative football player
waterpolo Water Polo is a team water sport
waterpolo water polo is a water sport
waterpolo Williams Wilson invented the first rules for water polo
racket_sport badminton and tennis are played using rackets
waterpolo Water Polo is a great water sport .
tennis Raphael Nadal has won the grand schlem many times .
tennis Federer and Nadal will be playing in the finals .
ping_pong Ping Pong tournament in Paris .
tennis Rollang Garros tournament in Paris .
tennis Wimbledon won by Federer .
football Zidane and Ronaldo are great team players .
baseball the Oackland As won the game .
baseball great home run by the Giants .
baseball MLB 's final taking place next week .
baseball MLB best player of the year .
baseball Oakland As won the MLB .
tennis Austrian Open won by Federer .
baseball Oakland As used statistics to gain an edge .
baseball Babe Ruth is an american favorite .
tennis when McEnroe plays there is a lot more than brute force .
football Zlatan is not a very talkative football player .
waterpolo Water Polo is a team water sport .
waterpolo water polo is a water sport .
waterpolo Williams Wilson invented the first rules for water polo .
racket_sport badminton and tennis are played using rackets .
rugby 14 teams qualify for the 2013 Rugby League World Cup : Australia , England , New Zealand , Samoa , Wales , Fiji , France , Papua New Guinea , Ireland , Scotland , Tonga , Cook Islands , Italy and United States of America .
rugby New Zealand defeat France 8–7 at Eden Park , Auckland , in the seventh rugby union World Cup , held in New Zealand .
rugby in 1969 Rugby league finally gains recognition as a sport in British universities and colleges .
football A player takes a free kick , while the opposition form a " wall " to try to block the ball
football In the late 1990s and early 2000s , the IFAB experimented with ways of creating a winner without requiring a penalty shootout , which was often seen as an undesirable way to end a match .
football Association football has been played by women since at least the time of the first recorded women 's games in the late 19th century
football Today , football is played at a professional level all over the world . Millions of people regularly go to football stadiums to follow their favourite teams , while billions more watch the game on television or on the internet
tennis Roger Federer is now considered by many observers to have the most " complete " game in modern tennis . He has won 17 grand slam titles and 6 world tour finals , the most for any male player .
tennis McEnroe exacted revenge two months later , beating Borg in the five-set final of the 1980 US Open .
polo The recent surge of excitement in south-east Asia around the game has resulted in its popularity in cities such as Pattaya , Kuala Lumpur and Jakarta .
polo The mounts used are called ' polo ponies ' , although the term pony is purely traditional and the mount is actually a full-sized horse
polo James Gordon Bennett , Jr. on 6 May 1876 organized what was billed as the first polo match in the United States at Dickel's Riding Academy at 39th Street and Fifth Avenue in New York City .
whale The bareback whale can stay under water for several hours
mammals Lions are a pack animal
reptiles Lizards can look in one direction while using their tongue in another
reptiles most frogs can jump several times their body length
reptiles it is currently believed that dinosaurs are the ancestors of chicken
mammals cow milk has only recently been collected by humans
reptiles the Komodo dragon is not only poisonous but also very fast
tennis in tennis reflexes are critical
rugby most rugby players are not as muscular as football players
baseball Baseball grew in popularity during the second world war in the United States
polo In Argentina polo is very popular unlike in most other southern American countries
mammals Dogs and cats are mammals
mammals Dog is man 's best friend
mammals Cats do not usually get along with dogs
whale All whales, dolphins and porpoises are descendants of land-living mammals , most likely of the Artiodactyl order . They entered the water roughly 50 million years ago .
whale The baleen whales are characterized by baleen , a sieve-like structure in the upper jaw made of keratin , which they use to filter plankton from the water . They are the largest species of whale .
whale Like all mammals , whales breathe air into lungs , are warm-blooded , feed their young milk from mammary glands , and have some (although very little ) hair .
whale A young scientist , Eric Alexander Ivanov , in 1911 , was the first to discover that the whale 's ancestors lived on land , and that whales have adapted to a fully aquatic life .
alligator Alligators are characterized by a broader snout and eyes more dorsally located than their crocodile cousins .
alligator According to the Everglades National Park website , the largest alligator ever recorded in Florida was 17 feet 5 inches long ( 5.3 meters ) .
alligator There are only two countries on earth that have alligators : the United States and China .
alligator Although alligators have heavy bodies and slow metabolisms , they are capable of short bursts of speed that can exceed 30 miles per hour though this could more properly be classified as a short fast lunge rather than a dash . Alligators ' main prey are smaller animals that they can kill and eat with a single bite .
alligator Alligators may kill larger prey by grabbing it and dragging it in the water to drown .
alligator Alligators consume food that cannot be eaten in one bite by allowing it to rot or by biting and then spinning or convulsing wildly until bite size pieces are torn off . This is referred to as the " death roll . "
frog Adult frogs are characterised by long hind legs , a short body , webbed digits , protruding eyes and the absence of a tail .
frog Most frogs have a semi-aquatic lifestyle , but move easily on land by jumping or climbing .
whale The baleen whales are characterized by baleen , a sieve-like structure in the upper jaw made of keratin , which they use to filter plankton from the water . They are the largest species of whale
whale Like all mammals , whales breathe air into lungs , are warm-blooded , feed their young milk from mammary glands , and have some (although very little ) hair
whale A young scientist , Eric Alexander Ivanov , in 1911 , was the first to discover that the whale 's ancestors lived on land , and that whales have adapted to a fully aquatic life
alligator Alligators are characterized by a broader snout and eyes more dorsally located than their crocodile cousins
alligator According to the Everglades National Park website , the largest alligator ever recorded in Florida was 17 feet 5 inches long ( 5.3 meters )
alligator There are only two countries on earth that have alligators : the United States and China
alligator Although alligators have heavy bodies and slow metabolisms , they are capable of short bursts of speed that can exceed 30 miles per hour though this could more properly be classified as a short fast lunge rather than a dash . Alligators ' main prey are smaller animals that they can kill and eat with a single bite
alligator Alligators may kill larger prey by grabbing it and dragging it in the water to drown
alligator Alligators consume food that cannot be eaten in one bite by allowing it to rot or by biting and then spinning or convulsing wildly until bite size pieces are torn off . This is referred to as the " death roll .
frog Adult frogs are characterised by long hind legs , a short body , webbed digits , protruding eyes and the absence of a tail
frog Most frogs have a semi-aquatic lifestyle , but move easily on land by jumping or climbing