""" | |
Character-based language model in TensorFlow | |
============================================ | |
A character-based language model written in TensorFlow and trained on | |
the Penn Treebank dataset. The model can either be trained using TFs | |
model.fit() method (auto) or using a custom training loop | |
(manual). Results should not differ. | |
Usage: | |
char_lm_tf.py [options] ( manual | auto ) | |
Options: | |
-h --help show this screen | |
-v --verbose print more output | |
--batch-size=<i> batch size [default: 32] | |
--epochs=<i> number of epochs [default: 200] | |
--seq-len=<i> sequence length [default: 320] | |
Validation losses: | |
best ep10 ep20 ep50 ep100 | |
auto 0.9483 1.0704 0.9892 0.9483 0.9483 | |
auto 0.9470 1.0924 1.0059 0.9496 0.9470 | |
auto 0.9483 1.0783 0.9732 0.9483 0.9483 | |
auto 0.9534 1.0626 0.9885 0.9534 0.9534 | |
auto 0.9544 1.0447 0.9770 0.9544 0.9544 | |
manual 0.9544 1.0595 0.9848 0.9544 0.9544 | |
manual 0.9483 1.1425 1.0478 0.9628 0.9483 | |
manual 0.9490 1.0937 1.0067 0.9516 0.9490 | |
manual 0.9491 1.2303 1.0864 0.9728 0.9491 | |
manual 0.9465 1.1122 1.0206 0.9520 0.9465 | |
""" | |
# from docopt import docopt
from observations import ptb
from os import environ
from tensorflow.config import experimental_connect_to_cluster
from tensorflow.data import Dataset
from tensorflow.distribute import OneDeviceStrategy
from tensorflow.distribute.cluster_resolver import TPUClusterResolver
from tensorflow.distribute.experimental import TPUStrategy
from tensorflow.keras import Model, metrics
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, TimeDistributed
from tensorflow.keras.optimizers import SGD
from tensorflow.tpu.experimental import initialize_tpu_system
from time import time
import tensorflow as tf

def compute_and_apply_gradients(model, x, y):
    # Run the forward pass under a gradient tape so the compiled loss
    # can be differentiated w.r.t. the trainable weights.
    with tf.GradientTape() as tape:
        y_hat = model(x, training = True)
        loss = model.compiled_loss(y, y_hat,
                                   regularization_losses = model.losses)
    vars = model.trainable_variables
    grads = tape.gradient(loss, vars)
    # Clip each gradient tensor to an L2 norm of at most 0.5.
    grads = [tf.clip_by_norm(g, 0.5) for g in grads]
    model.optimizer.apply_gradients(zip(grads, vars))
    return y_hat

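# A quick sanity check of the per-tensor clipping above (values are
# illustrative, not from a training run):
#
#   g = tf.constant([3.0, 4.0])    # L2 norm 5.0
#   tf.clip_by_norm(g, 0.5)        # -> [0.3, 0.4], i.e. norm 0.5
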
class MyModel(Model):
    # Overriding train_step makes model.fit() (the "auto" mode) use
    # the same gradient computation as the manual loop.
    def train_step(self, data):
        x, y = data
        y_hat = compute_and_apply_gradients(self, x, y)
        self.compiled_metrics.update_state(y, y_hat)
        return {m.name: m.result() for m in self.metrics}

def create_model(seq_len, vocab_size):
    # Embedding -> single LSTM layer -> per-timestep softmax over the
    # character vocabulary.
    inp = Input(shape = (seq_len,), batch_size = None, dtype = tf.int32)
    emb = Embedding(input_dim = vocab_size, output_dim = 100)
    lstm = LSTM(700, return_sequences = True, dropout = 0.3)
    time_dist = TimeDistributed(Dense(vocab_size, activation = 'softmax'))
    out = time_dist(lstm(emb(inp)))
    return MyModel(inputs = [inp], outputs = [out])

def sequence_to_samples(seq, seq_len):
    # Cut the character stream into chunks of seq_len + 1 and split
    # each chunk into an input sequence and a target sequence shifted
    # one step ahead.
    def split_input_target(chunk):
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text
    src = tf.constant(seq, dtype = tf.int32)
    return Dataset.from_tensor_slices(src) \
        .batch(seq_len + 1, drop_remainder = True) \
        .map(split_input_target)

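# For example, with seq_len = 3 a chunk [5, 2, 7, 1] becomes the pair
# (input, target) = ([5, 2, 7], [2, 7, 1]): at every position the
# model learns to predict the next character.
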
def select_strategy():
    # Use a TPU when running on Colab with a TPU runtime, otherwise
    # fall back to a single-device CPU strategy.
    tpu_addr = environ.get('COLAB_TPU_ADDR')
    if not tpu_addr:
        return OneDeviceStrategy(device = "/cpu:0")
    resolver = TPUClusterResolver('grpc://' + tpu_addr)
    experimental_connect_to_cluster(resolver)
    initialize_tpu_system(resolver)
    return TPUStrategy(resolver)

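# Illustration (replica counts depend on the runtime): a Colab v2-8
# TPU exposes eight cores, a CPU run just one device:
#
#   strategy = select_strategy()
#   strategy.num_replicas_in_sync    # 8 on a v2-8 TPU, 1 on CPU
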
def distribute_dataset(strategy, dataset, batch_size):
    # Each replica batches the dataset itself, so batch_size here is
    # the per-replica batch size.
    def dataset_fn(ctx):
        return dataset.batch(batch_size, drop_remainder = True)
    return strategy.experimental_distribute_datasets_from_function(
        dataset_fn)

class LossAccObserver:
    # Accumulates loss and accuracy over the steps of one epoch.
    def __init__(self):
        self.loss = metrics.SparseCategoricalCrossentropy()
        self.acc = metrics.SparseCategoricalAccuracy()
    def reset(self):
        self.loss.reset_states()
        self.acc.reset_states()
    def update(self, y, y_hat):
        self.loss.update_state(y, y_hat)
        self.acc.update_state(y, y_hat)

@tf.function
def train_epoch(model, strategy, batch_size, dataset, obs):
    # The whole epoch runs as one compiled graph; strategy.run
    # executes the step on every replica.
    def step_fn(x, y):
        y_hat = compute_and_apply_gradients(model, x, y)
        obs.update(y, y_hat)
    for x, y in dataset:
        strategy.run(step_fn, args = (x, y))

@tf.function
def evaluate_epoch(model, strategy, dataset, obs):
    def step_fn(x, y):
        y_hat = model(x, training = False)
        obs.update(y, y_hat)
    for x, y in dataset:
        strategy.run(step_fn, args = (x, y))

def manual_training(model, strategy, train, valid, batch_size, epochs):
    # The metric variables must be created under the strategy's scope.
    with strategy.scope():
        train_obs = LossAccObserver()
        valid_obs = LossAccObserver()
    # Split the global batch across the replicas.
    batch_size_per_replica = batch_size // strategy.num_replicas_in_sync
    train = distribute_dataset(strategy, train, batch_size_per_replica)
    valid = distribute_dataset(strategy, valid, batch_size_per_replica)
    fmt = '-> %3d / %3d - %4db - %3ds - %.4f / %.4f - %.2f / %.2f %s'
    val_losses = []
    last_time = time()
    last_n_steps = 0
    for i in range(epochs):
        train_epoch(model, strategy, batch_size, train, train_obs)
        evaluate_epoch(model, strategy, valid, valid_obs)
        new_time = time()
        val_loss = valid_obs.loss.result()
        new_n_steps = model.optimizer.iterations.numpy()
        time_delta = new_time - last_time
        n_steps_delta = new_n_steps - last_n_steps
        # Mark epochs that reach a new best validation loss.
        mark = ' '
        if val_loss < min(val_losses, default = 100):
            mark = '*'
        args = (i + 1, epochs, n_steps_delta, time_delta,
                train_obs.loss.result(), val_loss,
                train_obs.acc.result(), valid_obs.acc.result(), mark)
        print(fmt % args)
        last_time = new_time
        last_n_steps = new_n_steps
        val_losses.append(val_loss)
        train_obs.reset()
        valid_obs.reset()

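# A line of manual-training output then looks roughly like this
# (numbers are illustrative, not from a real run):
#
#   ->   7 / 200 -  138b -  12s - 1.1021 / 1.0478 - 0.67 / 0.68 *
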
def automatic_training(model, train, valid, batch_size, epochs):
    # model.fit() drives MyModel.train_step, so both modes share the
    # same gradient computation.
    train = train.batch(batch_size, drop_remainder = True)
    valid = valid.batch(batch_size, drop_remainder = True)
    model.fit(x = train, validation_data = valid,
              epochs = epochs,
              verbose = 2)

def main():
    # Parameters. Uncomment the docopt lines to drive these from the
    # command line instead of the hardcoded values below.
    # args = docopt(__doc__, version = 'Char-based LM in TF 1.0')
    # batch_size = int(args['--batch-size'])
    # seq_len = int(args['--seq-len'])
    # epochs = int(args['--epochs'])
    # manual_mode = True if args['manual'] else False
    batch_size = 32
    seq_len = 320
    epochs = 200
    manual_mode = True

    # Select strategy.
    strategy = select_strategy()

    # Load and transform data: build a character vocabulary from the
    # training text and encode both splits as integer sequences.
    train, _, valid = ptb('./data')
    ix2ch = sorted(set(train))
    ch2ix = {c : i for i, c in enumerate(ix2ch)}
    train = sequence_to_samples([ch2ix[c] for c in train], seq_len)
    valid = sequence_to_samples([ch2ix[c] for c in valid], seq_len)
    vocab_size = len(ix2ch)

    # Create model and optimizer under the strategy's scope.
    with strategy.scope():
        model = create_model(seq_len, vocab_size)
        model.compile(
            optimizer = SGD(learning_rate = 4),
            loss = 'sparse_categorical_crossentropy',
            metrics = ['sparse_categorical_accuracy'])
    if manual_mode:
        manual_training(model, strategy, train, valid, batch_size, epochs)
    else:
        automatic_training(model, train, valid, batch_size, epochs)

if __name__ == '__main__':
    main()