A character-based Seq2Seq model
# Code for https://www.data-blogger.com/2017/12/14/create-a-character-based-seq2seq-using-python-and-tensorflow/
# Imports
import tensorflow as tf
import pandas as pd
from nltk import word_tokenize
from collections import Counter
import numpy as np
# Load the data
with open('hamlet.txt', 'r') as input_file:
    words_raw = input_file.read()
# Load the words
words = word_tokenize(words_raw)
words = [word.lower() for word in words]
chars = list(' '.join(words))
char_counts = Counter(chars)
# Add a PAD token (to fill up empty spaces), EOW token (for signaling the End-Of-Word) and UNK token (for unknown characters)
# Also use the 100 most frequent characters
vocab = ['PAD', 'EOW', 'UNK'] + [char for char, count in char_counts.most_common(100)]
vocab = {char: i for i, char in enumerate(vocab)}
vocab_size = len(vocab)
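# For illustration (the exact ids depend on the corpus): the three special tokens
# always take the first three ids, so vocab['PAD'] == 0, vocab['EOW'] == 1 and
# vocab['UNK'] == 2, while frequent characters such as ' ' or 'e' get small ids >= 3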
# Configuration
input_embedding_size = 20
encoder_hidden_units = 16
# Create the graph
tf.reset_default_graph()
# Create the encoder and decoder
encoder_inputs = tf.placeholder(shape=[None, None], dtype=tf.int32, name='encoder_inputs')
encoder_inputs_length = tf.placeholder(shape=[None,], dtype=tf.int32, name='encoder_inputs_length')
decoder_targets = tf.placeholder(shape=[None, None], dtype=tf.int32, name='decoder_targets')
embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1., 1.), dtype=tf.float32)
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)
# Use separate cell instances for the forward and backward direction (reusing one
# instance in two variable scopes raises an error in some TF 1.x versions)
encoder_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(encoder_hidden_units)
encoder_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(encoder_hidden_units)
((encoder_fw_outputs, encoder_bw_outputs),
 (encoder_fw_final_state, encoder_bw_final_state)) = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=encoder_cell_fw,
    cell_bw=encoder_cell_bw,
    inputs=encoder_inputs_embedded,
    sequence_length=encoder_inputs_length,
    dtype=tf.float32,
    time_major=True)
# Short note: a bidirectional LSTM is used, so the encoder captures both forward
# information (prefixes) and backward information (suffixes); its forward and
# backward final states are concatenated into a single decoder start state
encoder_final_state_h = tf.concat([encoder_fw_final_state.h, encoder_bw_final_state.h], axis=1)
encoder_final_state_c = tf.concat([encoder_fw_final_state.c, encoder_bw_final_state.c], axis=1)
encoder_final_state = tf.contrib.rnn.LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)
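# The concatenated state has shape [batch_size, 2 * encoder_hidden_units], which is
# why the decoder below uses twice the encoder's number of hidden units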
# The decoder
decoder_hidden_units = 2 * encoder_hidden_units
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(decoder_hidden_units)
encoder_max_time, batch_size = tf.unstack(tf.shape(encoder_inputs))  # encoder_inputs is time-major
# 3 is arbitrary, the point is to make the decoder longer than the encoder
decoder_lengths = encoder_inputs_length + 3
# Output projection (convert the internal state vector to character representation)
W = tf.Variable(tf.random_uniform([decoder_hidden_units, vocab_size], -1, 1), dtype=tf.float32)
b = tf.Variable(tf.zeros([vocab_size]), dtype=tf.float32)
# A quick sanity check
assert vocab['EOW'] == 1 and vocab['PAD'] == 0 and vocab['UNK'] == 2
# Setup the input for the decoder
eow_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOW')
pad_time_slice = tf.zeros([batch_size], dtype=tf.int32, name='PAD')
eow_step_embedded = tf.nn.embedding_lookup(embeddings, eow_time_slice)
pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)
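# eow_step_embedded is the very first decoder input; pad_step_embedded replaces the
# input once every sequence in the batch has finished decoding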
# We drive the decoder RNN ourselves and feed each step's own prediction back in as
# the next input (greedy decoding; the original post describes this as a trivial form of soft attention)
# To define the behaviour of the RNN, an initializer and a transition function need to be specified
def loop_fn_initial():
    initial_elements_finished = (0 >= decoder_lengths)  # no sequence has finished at time 0
    initial_input = eow_step_embedded                   # start decoding from the EOW token
    initial_cell_state = encoder_final_state            # start from the encoder's final state
    initial_cell_output = None
    initial_loop_state = None
    return (initial_elements_finished,
            initial_input,
            initial_cell_state,
            initial_cell_output,
            initial_loop_state)
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    # Greedy decoding: pick the most likely character of the previous step and feed
    # its embedding back in as the next input (no teacher forcing)
    def get_next_input():
        output_logits = tf.add(tf.matmul(previous_output, W), b)
        prediction = tf.argmax(output_logits, axis=1)
        next_input = tf.nn.embedding_lookup(embeddings, prediction)
        return next_input
    elements_finished = (time >= decoder_lengths)
    finished = tf.reduce_all(elements_finished)
    next_input = tf.cond(finished, lambda: pad_step_embedded, get_next_input)
    state = previous_state
    output = previous_output
    loop_state = None
    return (elements_finished, next_input, state, output, loop_state)
# Now we combine the initializer and the transition function
def loop_fn(time, previous_output, previous_state, previous_loop_state):
    if previous_state is None:  # the initialization call at time 0
        assert previous_output is None
        return loop_fn_initial()
    else:
        return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
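# Note: tf.nn.raw_rnn first calls loop_fn with previous_state=None (hence the branch
# above) and afterwards once per decoding time step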
# We can now create the decoder RNN
decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
decoder_outputs = decoder_outputs_ta.stack()
# Now we convert the state vectors to a human-readable representation using the weights (W) and biases (b) defined earlier
decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, W), b)
decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, vocab_size))
# And using this greedy approach, we select the most likely character:
decoder_prediction = tf.argmax(decoder_logits, 2)
# Now we can define the loss and create an optimizer
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
    logits=decoder_logits,
)
loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer().minimize(loss)
# Setup the session
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# A helper function for creating batches of data
def create_batch(inputs, max_sequence_length=None):
"""
Args:
inputs:
list of sentences (integer lists)
max_sequence_length:
integer specifying how large should `max_time` dimension be.
If None, maximum sequence length would be used
Outputs:
inputs_time_major:
input sentences transformed into time-major matrix
(shape [max_time, batch_size]) padded with 0s
sequence_lengths:
batch-sized list of integers specifying amount of active
time steps in each input sequence
"""
sequence_lengths = [len(seq) for seq in inputs]
batch_size = len(inputs)
if max_sequence_length is None:
max_sequence_length = max(sequence_lengths)
inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
for i, seq in enumerate(inputs):
for j, element in enumerate(seq):
inputs_batch_major[i, j] = element
# [batch_size, max_time] -> [max_time, batch_size]
inputs_time_major = inputs_batch_major.swapaxes(0, 1)
return inputs_time_major, sequence_lengths
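# A quick sanity check of create_batch on a toy input (the ids 5-9 are made up
# for illustration): two sequences become a time-major 3x2 matrix, padded with
# 0 (PAD) where the shorter sequence ends
_toy_batch, _toy_lengths = create_batch([[5, 6, 7], [8, 9]])
assert _toy_batch.tolist() == [[5, 8], [6, 9], [7, 0]]
assert _toy_lengths == [3, 2]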
# A helper function to convert characters to ids
def word_to_ids(word):
    return [vocab.get(c, vocab['UNK']) for c in word]
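# For example, word_to_ids('hamlet') yields one id per character; characters that
# did not make the 100 most frequent ones map to UNK (id 2)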
# And a helper function to generate a feed dict that converts the data into the form expected by the network
def get_feed_dict(words, lengths=None):
    encoder_inputs_, encoder_input_lengths_ = create_batch([word_to_ids(seq) for seq in words])
    # The target is the word itself followed by EOW and two PADs, i.e. length + 3,
    # which matches decoder_lengths = encoder_inputs_length + 3
    decoder_targets_, _ = create_batch([word_to_ids(seq) + [vocab['EOW']] + [vocab['PAD']] * 2 for seq in words])
    if lengths is not None:
        encoder_input_lengths_ = lengths  # allow the caller to override the computed lengths
    return {
        encoder_inputs: encoder_inputs_,
        encoder_inputs_length: encoder_input_lengths_,
        decoder_targets: decoder_targets_
    }
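# As an illustration (the exact ids depend on the corpus): get_feed_dict(['to', 'be'])
# feeds a [2, 2] time-major encoder batch and a [5, 2] decoder target batch
# (2 characters + EOW + 2 PADs per word)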
# Now we can train!
batch_size = 10
batches_in_epoch = 100  # print progress every 100 batches
print('Number of batches per pass: {}'.format(len(words) // batch_size))
vocab_inv = {index: word for word, index in vocab.items()}
for repeats in range(3):
    for batch in range(0, len(words) // batch_size):
        i = batch * batch_size
        j = (batch + 1) * batch_size
        fd = get_feed_dict(words[i:j])
        _, l = sess.run([train_op, loss], fd)
        if batch == 0 or batch % batches_in_epoch == 0:
            print('batch {}'.format(batch))
            print('  minibatch loss: {}'.format(l))
            predict_ = sess.run(decoder_prediction, fd)
            for sample, (inp, pred) in enumerate(zip(fd[encoder_inputs].T, predict_.T)):
                # ids < 3 are the special tokens (PAD, EOW, UNK); skip them when
                # rendering the character strings
                inp_txt = ''.join([vocab_inv.get(item, '?') for item in inp if item >= 3])
                pred_txt = ''.join([vocab_inv.get(item, '?') for item in pred if item >= 3])
                print('  sample {}:'.format(sample + 1))
                print('    input         > {}'.format(inp))
                print('    predicted     > {}'.format(pred))
                print('    input_txt     > {}'.format(inp_txt))
                print('    predicted_txt > {}'.format(pred_txt))
                if sample >= 2:
                    break
            print()
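# After training, the same graph can be reused for a quick qualitative check.
# A minimal sketch (the probe word 'hamlet' is an arbitrary choice):
fd_probe = get_feed_dict(['hamlet'])
probe_pred = sess.run(decoder_prediction, fd_probe)
print('hamlet ->', ''.join(vocab_inv.get(item, '?') for item in probe_pred.T[0] if item >= 3))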