#!/usr/bin/env python3
import numpy as np
import tensorflow as tf
import decoder
from morpho_dataset import MorphoDataset

class Network:
    def __init__(self, args, num_source_chars, num_target_chars):
        class Model(tf.keras.Model):
            def __init__(self):
                super().__init__()

                # TODO(lemmatizer_noattn): Define
                # - source_embeddings as a masked embedding layer of source chars into args.cle_dim dimensions
                self.source_embeddings = tf.keras.layers.Embedding(
                    input_dim=num_source_chars,
                    output_dim=args.cle_dim,
                    mask_zero=True,
                )
                # TODO: Define
                # - source_rnn as a bidirectional GRU with args.rnn_dim units, returning _whole sequences_, summing opposite directions
                self.source_rnn = tf.keras.layers.Bidirectional(
                    tf.keras.layers.GRU(args.rnn_dim, return_sequences=True),
                    merge_mode="sum",
                )
                # TODO(lemmatizer_noattn): Define
                # - target_embedding as an unmasked embedding layer of target chars into args.cle_dim dimensions
                # - target_rnn_cell as a GRUCell with args.rnn_dim units
                # - target_output_layer as a Dense layer into `num_target_chars`
                self.target_embedding = tf.keras.layers.Embedding(
                    input_dim=num_target_chars,
                    output_dim=args.cle_dim,
                    mask_zero=False,
                )
                self.target_rnn_cell = tf.keras.layers.GRUCell(args.rnn_dim)
                self.target_output_layer = tf.keras.layers.Dense(num_target_chars)
                # TODO: Define
                # - attention_source_layer as a Dense layer with args.rnn_dim outputs
                # - attention_state_layer as a Dense layer with args.rnn_dim outputs
                # - attention_weight_layer as a Dense layer with 1 output
                self.attention_source_layer = tf.keras.layers.Dense(args.rnn_dim)
                self.attention_state_layer = tf.keras.layers.Dense(args.rnn_dim)
                self.attention_weight_layer = tf.keras.layers.Dense(1)

        self._model = Model()

        self._optimizer = tf.optimizers.Adam()
        # TODO(lemmatizer_noattn): Define self._loss as SparseCategoricalCrossentropy which processes _logits_ instead of probabilities
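        # A possible implementation of the TODO above (a sketch): cross-entropy that
        # consumes the raw logits of target_output_layer, hence from_logits=True.
        self._loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)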
        self._metrics_training = {"loss": tf.metrics.Mean(), "accuracy": tf.metrics.SparseCategoricalAccuracy()}
        self._metrics_evaluation = {"accuracy": tf.metrics.Mean()}
        self._writer = tf.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)

    def _append_eow(self, sequences):
        """Append an EOW character after the end of every given sequence."""
        sequences_rev = tf.reverse_sequence(sequences, tf.reduce_sum(tf.cast(tf.not_equal(sequences, 0), tf.int32), axis=1), 1)
        sequences_rev_eow = tf.pad(sequences_rev, [[0, 0], [1, 0]], constant_values=MorphoDataset.Factor.EOW)
        return tf.reverse_sequence(sequences_rev_eow, tf.reduce_sum(tf.cast(tf.not_equal(sequences_rev_eow, 0), tf.int32), axis=1), 1)

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 4, autograph=False)
    def train_batch(self, source_charseq_ids, source_charseqs, target_charseq_ids, target_charseqs):
        # TODO(lemmatizer_noattn): Modify target_charseqs by appending EOW; only the version with appended EOW is used from now on.
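        # A sketch of the TODO above, using the _append_eow helper defined earlier.
        target_charseqs = self._append_eow(target_charseqs)
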
        with tf.GradientTape() as tape:
            # TODO(lemmatizer_noattn): Embed source charseqs
            # TODO: Run self._model.source_rnn on the embedded sequences, returning outputs in `source_encoded`.
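            # A sketch of the two TODOs above: embed the source characters and encode
            # them with the bidirectional GRU defined in Model.
            source_embedded = self._model.source_embeddings(source_charseqs)
            source_encoded = self._model.source_rnn(source_embedded)
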
            # Copy the source_encoded to corresponding batch places, and then flatten it
            source_mask = tf.not_equal(source_charseq_ids, 0)
            source_encoded = tf.boolean_mask(tf.gather(source_encoded, source_charseq_ids), source_mask)
            targets = tf.boolean_mask(tf.gather(target_charseqs, target_charseq_ids), source_mask)

            class DecoderTraining(decoder.BaseDecoder):
                @property
                def batch_size(self): return tf.shape(self._source_encoded)[0]  # TODO: Return the batch size of self._source_encoded, using tf.shape
                @property
                def output_size(self): return tf.TensorShape([self._model.target_output_layer.units])  # TODO(lemmatizer_noattn): Return the number of logits per each output (a sketch: the units of target_output_layer, i.e., num_target_chars)
                @property
                def output_dtype(self): return tf.float32  # TODO(lemmatizer_noattn): Return the type of the logits
                def _with_attention(self, inputs, states):
                    # TODO: Compute the attention.
                    # - Take self._source_encoded and pass it through the self._model.attention_source_layer.
                    #   Because self._source_encoded does not change, you should in fact do it in `initialize`.
                    # - Pass `states` through self._model.attention_state_layer.
                    # - Sum the two outputs. However, the first has shape [a, b, c] and the second [a, c]. Therefore,
                    #   somehow expand the second to [a, b, c] first. (Hint: use broadcasting rules.)
                    # - Pass the sum through `tf.tanh`, then self._model.attention_weight_layer.
                    # - Then, run softmax on a suitable axis (the one corresponding to characters), generating `weights`.
                    # - Multiply `self._source_encoded` with `weights` and sum the result in the axis
                    #   corresponding to characters, generating `attention`. Therefore, `attention` is a fixed-size
                    #   representation for every batch element, independently of how many characters
                    #   the corresponding input forms had.
                    # - Finally, concatenate `inputs` and `attention` and return the result.
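                    # A sketch of the computation described above; `self._attention_source` is
                    # assumed to be precomputed in `initialize`, as the first bullet suggests.
                    weight_states = self._model.attention_state_layer(states)
                    weights = self._model.attention_weight_layer(
                        tf.tanh(self._attention_source + tf.expand_dims(weight_states, axis=1)))
                    weights = tf.nn.softmax(weights, axis=1)
                    attention = tf.reduce_sum(self._source_encoded * weights, axis=1)
                    return tf.concat([inputs, attention], axis=1)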
                def initialize(self, layer_inputs, initial_state=None):
                    self._model, self._source_encoded, self._targets = layer_inputs
                    # TODO(lemmatizer_noattn): Define `finished` as a vector of self.batch_size of `False` [see tf.fill].
                    # TODO(lemmatizer_noattn): Define `inputs` as a vector of self.batch_size of MorphoDataset.Factor.BOW [see tf.fill],
                    #   embedded using self._model.target_embedding
                    # TODO: Define `states` as the last words from self._source_encoded
                    # TODO: Pass `inputs` through `self._with_attention(inputs, states)`.
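                    # A sketch of the TODOs above; the attention projection of the source
                    # is also precomputed here for reuse by `_with_attention`.
                    self._attention_source = self._model.attention_source_layer(self._source_encoded)
                    finished = tf.fill([self.batch_size], False)
                    inputs = self._model.target_embedding(tf.fill([self.batch_size], MorphoDataset.Factor.BOW))
                    states = self._source_encoded[:, -1]
                    inputs = self._with_attention(inputs, states)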
                    return finished, inputs, states
                def step(self, time, inputs, states):
                    # TODO(lemmatizer_noattn): Pass `inputs` and `[states]` through self._model.target_rnn_cell, generating
                    #   `outputs, [states]`.
                    # TODO(lemmatizer_noattn): Overwrite `outputs` by passing them through self._model.target_output_layer.
                    # TODO(lemmatizer_noattn): Define `next_inputs` by embedding `time`-th words from `self._targets`.
                    # TODO(lemmatizer_noattn): Define `finished` as True if `time`-th word from `self._targets` is EOW, False otherwise.
                    #   Again, no == or !=.
                    # TODO: Pass `next_inputs` through `self._with_attention(next_inputs, states)`.
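                    # A sketch of the TODOs above; tf.equal is used instead of `==`.
                    outputs, [states] = self._model.target_rnn_cell(inputs, [states])
                    outputs = self._model.target_output_layer(outputs)
                    next_inputs = self._model.target_embedding(self._targets[:, time])
                    finished = tf.equal(self._targets[:, time], MorphoDataset.Factor.EOW)
                    next_inputs = self._with_attention(next_inputs, states)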
                    return outputs, states, next_inputs, finished
            output_layer, _, _ = DecoderTraining()([self._model, source_encoded, targets])

            # TODO(lemmatizer_noattn): Compute loss. Use only nonzero `targets` as a mask.
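            # A sketch: nonzero targets act as sample weights, masking out padding.
            loss = self._loss(targets, output_layer, tf.cast(tf.not_equal(targets, 0), tf.float32))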
        gradients = tape.gradient(loss, self._model.variables)
        self._optimizer.apply_gradients(zip(gradients, self._model.variables))

        tf.summary.experimental.set_step(self._optimizer.iterations)
        with self._writer.as_default():
            for name, metric in self._metrics_training.items():
                metric.reset_states()
                if name == "loss": metric(loss)
                else: metric(targets, output_layer, tf.not_equal(targets, 0))
                tf.summary.scalar("train/{}".format(name), metric.result())

        return tf.math.argmax(output_layer, axis=2)

    def train_epoch(self, dataset, args):
        for batch in dataset.batches(args.batch_size):
            # TODO(lemmatizer_noattn): Call train_batch, storing results in `predictions`.
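            # A sketch, mirroring the argument order of evaluate_batch below.
            predictions = self.train_batch(batch[dataset.FORMS].charseq_ids, batch[dataset.FORMS].charseqs,
                                           batch[dataset.LEMMAS].charseq_ids, batch[dataset.LEMMAS].charseqs)
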
            form, gold_lemma, system_lemma = "", "", ""
            for i in batch[dataset.FORMS].charseqs[1]:
                if i: form += dataset.data[dataset.FORMS].alphabet[i]
            for i in range(len(batch[dataset.LEMMAS].charseqs[1])):
                if batch[dataset.LEMMAS].charseqs[1][i]:
                    gold_lemma += dataset.data[dataset.LEMMAS].alphabet[batch[dataset.LEMMAS].charseqs[1][i]]
                    system_lemma += dataset.data[dataset.LEMMAS].alphabet[predictions[0][i]]
            print(float(self._metrics_training["accuracy"].result()), form, gold_lemma, system_lemma)

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 2, autograph=False)
    def predict_batch(self, source_charseq_ids, source_charseqs):
        # TODO(lemmatizer_noattn)(train_batch): Embed source charseqs
        # TODO(train_batch): Run self._model.source_rnn on the embedded sequences, returning outputs in `source_encoded`.
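        # A sketch, identical to train_batch: embed, then encode with the bidirectional GRU.
        source_embedded = self._model.source_embeddings(source_charseqs)
        source_encoded = self._model.source_rnn(source_embedded)
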
        # Copy the source_encoded to corresponding batch places, and then flatten it
        source_mask = tf.not_equal(source_charseq_ids, 0)
        source_encoded = tf.boolean_mask(tf.gather(source_encoded, source_charseq_ids), source_mask)

        class DecoderPrediction(decoder.BaseDecoder):
            @property
            def batch_size(self): return tf.shape(self._source_encoded)[0]  # TODO(lemmatizer_noattn)(train_batch): Return the batch size of self._source_encoded, using tf.shape
            @property
            def output_size(self): return 1  # TODO(lemmatizer_noattn): Return 1 because we are returning directly the predictions
            @property
            def output_dtype(self): return tf.int32  # TODO(lemmatizer_noattn): Return tf.int32 because the predictions are integral
            def _with_attention(self, inputs, states):
                # TODO: A copy of _with_attention from train_batch; you can of course
                # move the definition to a place where it can be reused in both places.
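                # A sketch: a verbatim copy of the _with_attention sketch from train_batch.
                weight_states = self._model.attention_state_layer(states)
                weights = self._model.attention_weight_layer(
                    tf.tanh(self._attention_source + tf.expand_dims(weight_states, axis=1)))
                weights = tf.nn.softmax(weights, axis=1)
                attention = tf.reduce_sum(self._source_encoded * weights, axis=1)
                return tf.concat([inputs, attention], axis=1)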
            def initialize(self, layer_inputs, initial_state=None):
                self._model, self._source_encoded = layer_inputs
                # TODO(lemmatizer_noattn)(train_batch): Define `finished` as a vector of self.batch_size of `False` [see tf.fill].
                # TODO(lemmatizer_noattn)(train_batch): Define `inputs` as a vector of self.batch_size of MorphoDataset.Factor.BOW [see tf.fill],
                #   embedded using self._model.target_embedding
                # TODO(train_batch): Define `states` as the last words from self._source_encoded
                # TODO(train_batch): Pass `inputs` through `self._with_attention(inputs, states)`.
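                # A sketch, analogous to DecoderTraining.initialize above.
                self._attention_source = self._model.attention_source_layer(self._source_encoded)
                finished = tf.fill([self.batch_size], False)
                inputs = self._model.target_embedding(tf.fill([self.batch_size], MorphoDataset.Factor.BOW))
                states = self._source_encoded[:, -1]
                inputs = self._with_attention(inputs, states)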
                return finished, inputs, states
            def step(self, time, inputs, states):
                # TODO(lemmatizer_noattn)(train_batch): Pass `inputs` and `[states]` through self._model.target_rnn_cell, generating
                #   `outputs, [states]`.
                # TODO(lemmatizer_noattn)(train_batch): Overwrite `outputs` by passing them through self._model.target_output_layer.
                # TODO(lemmatizer_noattn): Overwrite `outputs` by passing them through `tf.argmax` on suitable axis and with
                #   `output_type=tf.int32` parameter.
                # TODO(lemmatizer_noattn): Define `next_inputs` by embedding the `outputs`
                # TODO(lemmatizer_noattn): Define `finished` as True if `outputs` are EOW, False otherwise. [No == or !=].
                # TODO(train_batch): Pass `next_inputs` through `self._with_attention(next_inputs, states)`.
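                # A sketch of the TODOs above; the argmax runs over the logit axis.
                outputs, [states] = self._model.target_rnn_cell(inputs, [states])
                outputs = self._model.target_output_layer(outputs)
                outputs = tf.argmax(outputs, axis=1, output_type=tf.int32)
                next_inputs = self._model.target_embedding(outputs)
                finished = tf.equal(outputs, MorphoDataset.Factor.EOW)
                next_inputs = self._with_attention(next_inputs, states)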
                return outputs, states, next_inputs, finished
        predictions, _, _ = DecoderPrediction(maximum_iterations=tf.shape(source_charseqs)[1] + 10)([self._model, source_encoded])
        return predictions

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 4, autograph=False)
    def evaluate_batch(self, source_charseq_ids, source_charseqs, target_charseq_ids, target_charseqs):
        # Predict
        predictions = self.predict_batch(source_charseq_ids, source_charseqs)

        # Append EOW to target_charseqs, copy them to corresponding places, and flatten them
        target_charseqs = self._append_eow(target_charseqs)
        targets = tf.boolean_mask(tf.gather(target_charseqs, target_charseq_ids), tf.not_equal(source_charseq_ids, 0))

        # Compute accuracy, but on the whole sequences
        mask = tf.cast(tf.not_equal(targets, 0), tf.int32)
        resized_predictions = tf.concat([predictions, tf.zeros_like(targets)], axis=1)[:, :tf.shape(targets)[1]]
        equals = tf.reduce_all(tf.equal(resized_predictions * mask, targets * mask), axis=1)
        self._metrics_evaluation["accuracy"](equals)

    def evaluate(self, dataset, dataset_name, args):
        for metric in self._metrics_evaluation.values():
            metric.reset_states()
        for batch in dataset.batches(args.batch_size):
            self.evaluate_batch(batch[dataset.FORMS].charseq_ids, batch[dataset.FORMS].charseqs,
                                batch[dataset.LEMMAS].charseq_ids, batch[dataset.LEMMAS].charseqs)

        metrics = {name: float(metric.result()) for name, metric in self._metrics_evaluation.items()}
        with self._writer.as_default():
            for name, value in metrics.items():
                tf.summary.scalar("{}/{}".format(dataset_name, name), value)

        return metrics

if __name__ == "__main__":
    import argparse
    import datetime
    import os
    import re

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=10, type=int, help="Batch size.")
    parser.add_argument("--cle_dim", default=64, type=int, help="CLE embedding dimension.")
    parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
    parser.add_argument("--max_sentences", default=5000, type=int, help="Maximum number of sentences to load.")
    parser.add_argument("--recodex", default=False, action="store_true", help="Evaluation in ReCodEx.")
    parser.add_argument("--rnn_dim", default=64, type=int, help="RNN cell dimension.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    if args.recodex:
        tf.keras.utils.get_custom_objects()["glorot_uniform"] = lambda: tf.initializers.glorot_uniform(seed=42)
        tf.keras.utils.get_custom_objects()["orthogonal"] = lambda: tf.initializers.orthogonal(seed=42)
        tf.keras.utils.get_custom_objects()["uniform"] = lambda: tf.initializers.RandomUniform(seed=42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create logdir name
    args.logdir = os.path.join("logs", "{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value) for key, value in sorted(vars(args).items())))
    ))

    # Load the data
    morpho = MorphoDataset("czech_cac", max_sentences=args.max_sentences)

    # Create the network and train
    network = Network(args,
                      num_source_chars=len(morpho.train.data[morpho.train.FORMS].alphabet),
                      num_target_chars=len(morpho.train.data[morpho.train.LEMMAS].alphabet))
    for epoch in range(args.epochs):
        network.train_epoch(morpho.train, args)
        metrics = network.evaluate(morpho.dev, "dev", args)
        print("Evaluation on {}, epoch {}: {}".format("dev", epoch + 1, metrics))

    metrics = network.evaluate(morpho.test, "test", args)
    with open("lemmatizer.out", "w") as out_file:
        print("{:.2f}".format(100 * metrics["accuracy"]), file=out_file)