## Assignment 1 -- the task was simply to implement the LSTM cell with a single matrix multiplication:
in_mtx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes*4], -0.1, 0.1))
out_mtx = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
b_vec = tf.Variable(tf.zeros([1, num_nodes*4]))
# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
def lstm_cell(i, o, state):
  """Create an LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  Note that in this formulation, we omit the various connections between the
  previous state and the gates."""
  # product_tmp holds, in order: input_gate, forget_gate, output_gate, update.
  product_tmp = tf.matmul(i, in_mtx) + tf.matmul(o, out_mtx) + b_vec
  input_gate, forget_gate, output_gate, update = tf.split(product_tmp, num_or_size_splits=4, axis=1)
  input_gate = tf.sigmoid(input_gate)
  forget_gate = tf.sigmoid(forget_gate)
  output_gate = tf.sigmoid(output_gate)
  state = forget_gate * state + input_gate * tf.tanh(update)
  return output_gate * tf.tanh(state), state
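# Sanity check (not part of the assignment): stacking the four per-gate matrices
# column-wise and splitting the single product gives the same result as four separate
# matmuls. A minimal numpy sketch with illustrative shapes:
import numpy as np
V, N = 27, 64                       # stand-ins for vocabulary_size, num_nodes
x = np.random.randn(8, V)           # a batch of inputs
W = np.random.randn(V, 4 * N)       # fused matrix, analogous to in_mtx
fused = np.split(x @ W, 4, axis=1)  # -> the four gate pre-activations
separate = [x @ W[:, k * N:(k + 1) * N] for k in range(4)]  # analogous to ix/fx/ox/cx
assert all(np.allclose(a, b) for a, b in zip(fused, separate))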
### For comparison, the original version:
# Parameters:
# Input gate: input, previous output, and bias.
ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
ib = tf.Variable(tf.zeros([1, num_nodes]))
# Forget gate: input, previous output, and bias.
fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
fb = tf.Variable(tf.zeros([1, num_nodes]))
# Memory cell: input, state and bias.
cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
cb = tf.Variable(tf.zeros([1, num_nodes]))
# Output gate: input, previous output, and bias.
ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
ob = tf.Variable(tf.zeros([1, num_nodes]))
# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
# Definition of the cell computation.
def lstm_cell(i, o, state):
  """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  Note that in this formulation, we omit the various connections between the
  previous state and the gates."""
  input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
  forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
  update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
  state = forget_gate * state + input_gate * tf.tanh(update)
  output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
  return output_gate * tf.tanh(state), state
### Assignment 2 -- add Dropout and switch everything to bigrams ###
### We create a separate embedding table for the bigrams;
### We still predict one character at a time, but feed 2 characters as input (see the index sketch right after the problem statement below);
### Dropout is applied only on the cell input and output, not on the intermediate recurrent state;
# Original text problem:
# We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.
# a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.
# b- Write a bigram-based LSTM, modeled on the character LSTM above.
# c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this article.
###################################################################################################
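# A bigram is encoded as a single row index into the (vocabulary_size**2)-row embedding
# table: first_char_id * vocabulary_size + second_char_id. A minimal numpy sketch of the
# same argmax arithmetic used in the graph below (27 = space + 'a'..'z' in the notebook):
import numpy as np
V = 27
first = np.zeros((1, V)); first[0, 2] = 1.0    # one-hot for character id 2
second = np.zeros((1, V)); second[0, 5] = 1.0  # one-hot for character id 5
bigram_idx = np.argmax(first, axis=1) * V + np.argmax(second, axis=1)
print(bigram_idx)  # [59] == 2 * 27 + 5 -> row 59 of embeddings_mtx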
num_nodes = 64
embedding_size = 128
keep_prob = 0.5 # The probability that each element is kept;
# the same for input and output;
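# For reference: tf.nn.dropout keeps each element with probability keep_prob and scales
# the survivors by 1/keep_prob, so the expected activation is unchanged. A rough numpy
# sketch of that behaviour (not used by the graph below):
import numpy as np
def dropout_sketch(x, keep_prob=0.5):
  mask = (np.random.rand(*x.shape) < keep_prob).astype(x.dtype)  # Bernoulli keep-mask
  return x * mask / keep_prob                                    # inverted scaling
print(dropout_sketch(np.ones((4, 8), dtype=np.float32)).mean())  # ~1.0 on average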
graph = tf.Graph()
with graph.as_default():
  # Parameters:
  # Fused gate weights: input, previous output, and bias (all four gates in one matrix).
  in_mtx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes*4], -0.1, 0.1))
  out_mtx = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
  b_vec = tf.Variable(tf.zeros([1, num_nodes*4]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  # Embedding table: one row per bigram.
  embeddings_mtx = tf.Variable(tf.truncated_normal([vocabulary_size*vocabulary_size, embedding_size], -0.1, 0.1), trainable=True)
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create an LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    # product_tmp holds, in order: input_gate, forget_gate, output_gate, update.
    product_tmp = tf.matmul(i, in_mtx) + tf.matmul(o, out_mtx) + b_vec
    input_gate, forget_gate, output_gate, update = tf.split(product_tmp, num_or_size_splits=4, axis=1)
    input_gate = tf.sigmoid(input_gate)
    forget_gate = tf.sigmoid(forget_gate)
    output_gate = tf.sigmoid(output_gate)
    state = forget_gate * state + input_gate * tf.tanh(update)
    output = output_gate * tf.tanh(state)
    return output, state
  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_tmp = train_data[:num_unrollings]
  train_inputs = zip(train_tmp[:-1], train_tmp[1:])  # pairs of consecutive characters form the bigrams
  train_labels = train_data[2:]  # the label for bigram (t, t+1) is the character at t+2.
  # Unrolled LSTM loop.
  outputs = list()
  output_dropouted = saved_output
  state = saved_state
  for i in train_inputs:
    input_idx = tf.argmax(i[0], dimension=1)*vocabulary_size + tf.argmax(i[1], dimension=1)
    current_input = tf.nn.embedding_lookup(embeddings_mtx, input_idx)
    input_dropouted = tf.nn.dropout(current_input, keep_prob)  #### dropout on the cell input;
    output, state = lstm_cell(input_dropouted, output_dropouted, state)
    output_dropouted = tf.nn.dropout(output, keep_prob)  #### dropout on the cell output (fed to the next step);
    outputs.append(output)  #### the un-dropped output is still what we collect for the classifier;
  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))
  # Optimizer.
  global_step = tf.Variable(0)
  #### tf.train.exponential_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)
  #### decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
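  # Worked example of the staircase schedule above (added note, not from the gist):
  #   step     0: 10.0 * 0.1 ** (0 // 5000)     = 10.0
  #   step  4999: 10.0 * 0.1 ** (4999 // 5000)  = 10.0   (staircase=True -> integer division)
  #   step  5000: 10.0 * 0.1 ** (5000 // 5000)  = 1.0
  #   step 10000: 10.0 * 0.1 ** (10000 // 5000) = 0.1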
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)
  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = [tf.placeholder(tf.float32, shape=[1, vocabulary_size]),
                  tf.placeholder(tf.float32, shape=[1, vocabulary_size])]
  bigrams_idx = tf.argmax(sample_input[0], dimension=1)*vocabulary_size + tf.argmax(sample_input[1], dimension=1)
  sample_embeddings = tf.nn.embedding_lookup(embeddings_mtx, bigrams_idx)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_embeddings, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
################################
import collections
num_steps = 7001
summary_frequency = 100
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)  # assumed unchanged from the original notebook
valid_batches = BatchGenerator(valid_text, 1, 2)  # batch_size = 1, num_unrollings = 2: one bigram plus its label character
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])  ## labels start at the 3rd character because we feed bigrams
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          # feed = (sample(random_distribution()), sample(random_distribution()))  ## previous implementation
          feed = collections.deque(maxlen=2)  ## this structure is very convenient here:
          for _ in range(2):  ## it always holds exactly 2 characters, so appending the next one
            feed.append(sample(random_distribution()))  ## pushes the oldest one out
          sentence = characters(feed[0])[0] + characters(feed[1])[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input[0]: feed[0], sample_input[1]: feed[1]})
            feed.append(sample(prediction))  # the newest prediction replaces the oldest character
            sentence += characters(feed[1])[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input[0]: b[0], sample_input[1]: b[1]})
        valid_logprob = valid_logprob + logprob(predictions, b[2])  ## we now score the 3rd character, not the 2nd, because we feed bigrams
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))
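# For completeness: logprob(), sample(), random_distribution(), characters() and
# BatchGenerator come from the original Udacity 6_lstm.ipynb notebook and are not
# redefined in this gist. A sketch of the logprob helper this loop assumes:
import numpy as np
def logprob(predictions, labels):
  """Log-probability of the true labels in a batch of softmax predictions."""
  predictions[predictions < 1e-10] = 1e-10   # avoid log(0)
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]
# Perplexity, as printed above, is then np.exp(logprob(predictions, labels)).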