@bjourne, created June 21, 2020
"""
Character-based language model in TensorFlow
============================================
A character-based language model written in TensorFlow and trained on
the Penn Treebank dataset. The model can be trained either with TF's
model.fit() method (auto) or with a custom training loop
(manual). Results should not differ.

Usage:
    char_lm_tf.py [options] ( manual | auto )

Options:
    -h --help           show this screen
    -v --verbose        print more output
    --batch-size=<i>    batch size [default: 32]
    --epochs=<i>        number of epochs [default: 200]
    --seq-len=<i>       sequence length [default: 320]

Validation losses:

              best    ep10    ep20    ep50   ep100
    auto    0.9483  1.0704  0.9892  0.9483  0.9483
    auto    0.9470  1.0924  1.0059  0.9496  0.9470
    auto    0.9483  1.0783  0.9732  0.9483  0.9483
    auto    0.9534  1.0626  0.9885  0.9534  0.9534
    auto    0.9544  1.0447  0.9770  0.9544  0.9544
    manual  0.9544  1.0595  0.9848  0.9544  0.9544
    manual  0.9483  1.1425  1.0478  0.9628  0.9483
    manual  0.9490  1.0937  1.0067  0.9516  0.9490
    manual  0.9491  1.2303  1.0864  0.9728  0.9491
    manual  0.9465  1.1122  1.0206  0.9520  0.9465
"""
# from docopt import docopt
from observations import ptb
from os import environ
from pathlib import Path
from tensorflow.config import experimental_connect_to_cluster
from tensorflow.data import Dataset
from tensorflow.distribute import OneDeviceStrategy
from tensorflow.distribute.cluster_resolver import TPUClusterResolver
from tensorflow.distribute.experimental import TPUStrategy
from tensorflow.keras import Model, losses, metrics
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, TimeDistributed
from tensorflow.keras.optimizers import SGD
from tensorflow.tpu.experimental import initialize_tpu_system
from time import time
import tensorflow as tf
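
# One optimization step: forward pass under a GradientTape, compute the
# compiled loss, clip every gradient to norm 0.5, and apply the update
# with the model's optimizer.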
def compute_and_apply_gradients(model, x, y):
    with tf.GradientTape() as tape:
        y_hat = model(x, training = True)
        loss = model.compiled_loss(y, y_hat,
                                   regularization_losses = model.losses)
    vars = model.trainable_variables
    grads = tape.gradient(loss, vars)
    grads = [tf.clip_by_norm(g, 0.5) for g in grads]
    model.optimizer.apply_gradients(zip(grads, vars))
    return y_hat
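
# Subclassed Model whose train_step() delegates to the same gradient
# routine as the manual loop, so auto (model.fit) and manual training
# perform identical updates.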
class MyModel(Model):
    def train_step(self, data):
        x, y = data
        y_hat = compute_and_apply_gradients(self, x, y)
        self.compiled_metrics.update_state(y, y_hat)
        return {m.name: m.result() for m in self.metrics}
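
# Embedding (dim 100) -> LSTM (700 units, dropout 0.3) -> per-timestep
# softmax over the character vocabulary.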
def create_model(seq_len, vocab_size):
    inp = Input(shape = (seq_len,), batch_size = None, dtype = tf.int32)
    emb = Embedding(input_dim = vocab_size, output_dim = 100)
    lstm = LSTM(700, return_sequences = True, dropout = 0.3)
    time_dist = TimeDistributed(Dense(vocab_size, activation = 'softmax'))
    out = time_dist(lstm(emb(inp)))
    return MyModel(inputs = [inp], outputs = [out])
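
# Chops the integer-encoded character stream into chunks of seq_len + 1
# and yields (input, target) pairs where the target is the input
# shifted one character ahead.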
def sequence_to_samples(seq, seq_len):
    def split_input_target(chunk):
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text
    src = tf.constant(seq, dtype = tf.int32)
    return Dataset.from_tensor_slices(src) \
        .batch(seq_len + 1, drop_remainder = True) \
        .map(split_input_target)
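
# Runs on a Colab TPU when COLAB_TPU_ADDR is set, otherwise falls back
# to a single-device CPU strategy.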
def select_strategy():
    tpu_addr = environ.get('COLAB_TPU_ADDR')
    if not tpu_addr:
        return OneDeviceStrategy(device = "/cpu:0")
    resolver = TPUClusterResolver('grpc://' + tpu_addr)
    experimental_connect_to_cluster(resolver)
    initialize_tpu_system(resolver)
    return TPUStrategy(resolver)
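
# Wraps the dataset in a per-replica input function so each replica
# receives its own batches of the given (per-replica) batch size.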
def distribute_dataset(strategy, dataset, batch_size):
    def dataset_fn(ctx):
        return dataset.batch(batch_size, drop_remainder = True)
    return strategy.experimental_distribute_datasets_from_function(
        dataset_fn)
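
# Groups the loss and accuracy metrics that the manual loop updates
# every step and resets at the end of each epoch.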
class LossAccObserver:
    def __init__(self):
        self.loss = metrics.SparseCategoricalCrossentropy()
        self.acc = metrics.SparseCategoricalAccuracy()
    def reset(self):
        self.loss.reset_states()
        self.acc.reset_states()
    def update(self, y, y_hat):
        self.loss.update_state(y, y_hat)
        self.acc.update_state(y, y_hat)
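
# One training pass over the distributed dataset; tf.function traces the
# loop and strategy.run executes step_fn on every replica.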
@tf.function
def train_epoch(model, strategy, batch_size, dataset, obs):
    def step_fn(x, y):
        y_hat = compute_and_apply_gradients(model, x, y)
        obs.update(y, y_hat)
    for x, y in dataset:
        strategy.run(step_fn, args = (x, y))

@tf.function
def evaluate_epoch(model, strategy, dataset, obs):
    def step_fn(x, y):
        y_hat = model(x, training = False)
        obs.update(y, y_hat)
    for x, y in dataset:
        strategy.run(step_fn, args = (x, y))
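
# Custom training loop: distributes the data, alternates training and
# validation epochs, and prints per-epoch step counts, timing, losses
# and accuracies. A '*' marks a new best validation loss.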
def manual_training(model, strategy, train, valid, batch_size, epochs):
    with strategy.scope():
        train_obs = LossAccObserver()
        valid_obs = LossAccObserver()
    batch_size_per_replica = batch_size // strategy.num_replicas_in_sync
    train = distribute_dataset(strategy, train, batch_size_per_replica)
    valid = distribute_dataset(strategy, valid, batch_size_per_replica)
    fmt = '-> %3d / %3d - %4db - %3ds - %.4f / %.4f - %.2f / %.2f %s'
    val_losses = []
    last_time = time()
    last_n_steps = 0
    for i in range(epochs):
        train_epoch(model, strategy, batch_size, train, train_obs)
        evaluate_epoch(model, strategy, valid, valid_obs)
        new_time = time()
        val_loss = valid_obs.loss.result()
        new_n_steps = model.optimizer.iterations.numpy()
        time_delta = new_time - last_time
        n_steps_delta = new_n_steps - last_n_steps
        mark = ' '
        if val_loss < min(val_losses, default = 100):
            mark = '*'
        args = (i + 1, epochs, n_steps_delta, time_delta,
                train_obs.loss.result(), val_loss,
                train_obs.acc.result(), valid_obs.acc.result(), mark)
        print(fmt % args)
        last_time = new_time
        last_n_steps = new_n_steps
        val_losses.append(val_loss)
        train_obs.reset()
        valid_obs.reset()
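
# Built-in training loop: model.fit(), with MyModel.train_step doing the
# actual parameter updates.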
def automatic_training(model, train, valid, batch_size, epochs):
    train = train.batch(batch_size, drop_remainder = True)
    valid = valid.batch(batch_size, drop_remainder = True)
    model.fit(x = train, validation_data = valid,
              epochs = epochs,
              verbose = 2)
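
# Hyperparameters are hardcoded to the docstring defaults while the
# docopt argument parsing stays commented out.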
def main():
    # Parameters.
    # args = docopt(__doc__, version = 'Char-based LM in TF 1.0')
    # batch_size = int(args['--batch-size'])
    # seq_len = int(args['--seq-len'])
    # epochs = int(args['--epochs'])
    # manual_mode = True if args['manual'] else False
    batch_size = 32
    seq_len = 320
    epochs = 200
    manual_mode = True

    # Select strategy.
    strategy = select_strategy()

    # Load and transform data.
    train, _, valid = ptb('./data')
    ix2ch = sorted(set(train))
    ch2ix = {c : i for i, c in enumerate(ix2ch)}
    train = sequence_to_samples([ch2ix[c] for c in train], seq_len)
    valid = sequence_to_samples([ch2ix[c] for c in valid], seq_len)
    vocab_size = len(ix2ch)

    # Create model and optimizer.
    with strategy.scope():
        model = create_model(seq_len, vocab_size)
        model.compile(
            optimizer = SGD(learning_rate = 4),
            loss = 'sparse_categorical_crossentropy',
            metrics = ['sparse_categorical_accuracy'])
    if manual_mode:
        manual_training(model, strategy, train, valid, batch_size, epochs)
    else:
        automatic_training(model, train, valid, batch_size, epochs)

if __name__ == '__main__':
    main()