## Assignment 1 -- the task was simply to implement the LSTM cell with a single matrix multiplication:
in_mtx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes*4], -0.1, 0.1))
out_mtx = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
b_vec = tf.Variable(tf.zeros([1, num_nodes*4]))
# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
def lstm_cell(i, o, state):
  """Create an LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  Note that in this formulation, we omit the various connections between the
  previous state and the gates."""
  # product_tmp holds, in order: input_gate, forget_gate, output_gate, update.
  product_tmp = tf.matmul(i, in_mtx) + tf.matmul(o, out_mtx) + b_vec
  input_gate, forget_gate, output_gate, update = tf.split(product_tmp, num_or_size_splits=4, axis=1)
  input_gate = tf.sigmoid(input_gate)
  forget_gate = tf.sigmoid(forget_gate)
  output_gate = tf.sigmoid(output_gate)
  state = forget_gate * state + input_gate * tf.tanh(update)
  return output_gate * tf.tanh(state), state
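# Sanity check (not part of the assignment): stacking the four per-gate matrices
# column-wise and splitting the single product gives the same result as four separate
# matmuls. A minimal numpy sketch with illustrative shapes:
import numpy as np
V, N = 27, 64                       # stand-ins for vocabulary_size, num_nodes
x = np.random.randn(8, V)           # a batch of inputs
W = np.random.randn(V, 4 * N)       # fused matrix, analogous to in_mtx
fused = np.split(x @ W, 4, axis=1)  # -> the four gate pre-activations
separate = [x @ W[:, k * N:(k + 1) * N] for k in range(4)]  # analogous to ix/fx/ox/cx
assert all(np.allclose(a, b) for a, b in zip(fused, separate))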
### For comparison, the original version:
# Parameters:
# Input gate: input, previous output, and bias.
ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
ib = tf.Variable(tf.zeros([1, num_nodes]))
# Forget gate: input, previous output, and bias.
fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
fb = tf.Variable(tf.zeros([1, num_nodes]))
# Memory cell: input, state and bias.
cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
cb = tf.Variable(tf.zeros([1, num_nodes]))
# Output gate: input, previous output, and bias.
ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
ob = tf.Variable(tf.zeros([1, num_nodes]))
# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
# Definition of the cell computation.
def lstm_cell(i, o, state):
  """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  Note that in this formulation, we omit the various connections between the
  previous state and the gates."""
  input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
  forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
  update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
  state = forget_gate * state + input_gate * tf.tanh(update)
  output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
  return output_gate * tf.tanh(state), state
### Assignment 2 -- add Dropout and switch everything to bigrams ###
### We create a separate embedding table for the bigrams;
### We still predict one character at a time, but feed 2 characters as input (see the index sketch right after the problem statement below);
### Dropout is applied only on the cell input and output, not on the intermediate recurrent state;
# Original text problem:
# We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.
# a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.
# b- Write a bigram-based LSTM, modeled on the character LSTM above.
# c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this article.
###################################################################################################
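# A bigram is encoded as a single row index into the (vocabulary_size**2)-row embedding
# table: first_char_id * vocabulary_size + second_char_id. A minimal numpy sketch of the
# same argmax arithmetic used in the graph below (27 = space + 'a'..'z' in the notebook):
import numpy as np
V = 27
first = np.zeros((1, V)); first[0, 2] = 1.0    # one-hot for character id 2
second = np.zeros((1, V)); second[0, 5] = 1.0  # one-hot for character id 5
bigram_idx = np.argmax(first, axis=1) * V + np.argmax(second, axis=1)
print(bigram_idx)  # [59] == 2 * 27 + 5 -> row 59 of embeddings_mtx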
num_nodes = 64
embedding_size = 128
keep_prob = 0.5 # The probability that each element is kept;
# the same for input and output;
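# For reference: tf.nn.dropout keeps each element with probability keep_prob and scales
# the survivors by 1/keep_prob, so the expected activation is unchanged. A rough numpy
# sketch of that behaviour (not used by the graph below):
import numpy as np
def dropout_sketch(x, keep_prob=0.5):
  mask = (np.random.rand(*x.shape) < keep_prob).astype(x.dtype)  # Bernoulli keep-mask
  return x * mask / keep_prob                                    # inverted scaling
print(dropout_sketch(np.ones((4, 8), dtype=np.float32)).mean())  # ~1.0 on average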
graph = tf.Graph()
with graph.as_default():
  # Parameters:
  # Fused gate weights: input, previous output, and bias (all four gates in one matrix).
  in_mtx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes*4], -0.1, 0.1))
  out_mtx = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
  b_vec = tf.Variable(tf.zeros([1, num_nodes*4]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  # Embedding table: one row per bigram.
  embeddings_mtx = tf.Variable(tf.truncated_normal([vocabulary_size*vocabulary_size, embedding_size], -0.1, 0.1), trainable=True)
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create an LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    # product_tmp holds, in order: input_gate, forget_gate, output_gate, update.
    product_tmp = tf.matmul(i, in_mtx) + tf.matmul(o, out_mtx) + b_vec
    input_gate, forget_gate, output_gate, update = tf.split(product_tmp, num_or_size_splits=4, axis=1)
    input_gate = tf.sigmoid(input_gate)
    forget_gate = tf.sigmoid(forget_gate)
    output_gate = tf.sigmoid(output_gate)
    state = forget_gate * state + input_gate * tf.tanh(update)
    output = output_gate * tf.tanh(state)
    return output, state
  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_tmp = train_data[:num_unrollings]
  train_inputs = zip(train_tmp[:-1], train_tmp[1:])  # pairs of consecutive characters form the bigrams
  train_labels = train_data[2:]  # the label for bigram (t, t+1) is the character at t+2.
  # Unrolled LSTM loop.
  outputs = list()
  output_dropouted = saved_output
  state = saved_state
  for i in train_inputs:
    input_idx = tf.argmax(i[0], dimension=1)*vocabulary_size + tf.argmax(i[1], dimension=1)
    current_input = tf.nn.embedding_lookup(embeddings_mtx, input_idx)
    input_dropouted = tf.nn.dropout(current_input, keep_prob)  #### dropout on the cell input;
    output, state = lstm_cell(input_dropouted, output_dropouted, state)
    output_dropouted = tf.nn.dropout(output, keep_prob)  #### dropout on the cell output (fed to the next step);
    outputs.append(output)  #### the un-dropped output is still what we collect for the classifier;
  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))
  # Optimizer.
  global_step = tf.Variable(0)
  #### tf.train.exponential_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)
  #### decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
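  # Worked example of the staircase schedule above (added note, not from the gist):
  #   step     0: 10.0 * 0.1 ** (0 // 5000)     = 10.0
  #   step  4999: 10.0 * 0.1 ** (4999 // 5000)  = 10.0   (staircase=True -> integer division)
  #   step  5000: 10.0 * 0.1 ** (5000 // 5000)  = 1.0
  #   step 10000: 10.0 * 0.1 ** (10000 // 5000) = 0.1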
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)
  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = [tf.placeholder(tf.float32, shape=[1, vocabulary_size]),
                  tf.placeholder(tf.float32, shape=[1, vocabulary_size])]
  bigrams_idx = tf.argmax(sample_input[0], dimension=1)*vocabulary_size + tf.argmax(sample_input[1], dimension=1)
  sample_embeddings = tf.nn.embedding_lookup(embeddings_mtx, bigrams_idx)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_embeddings, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
################################
import collections
num_steps = 7001
summary_frequency = 100
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)  # assumed unchanged from the original notebook
valid_batches = BatchGenerator(valid_text, 1, 2)  # batch_size = 1, num_unrollings = 2: one bigram plus its label character
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])  ## labels start at the 3rd character because we feed bigrams
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          # feed = (sample(random_distribution()), sample(random_distribution()))  ## previous implementation
          feed = collections.deque(maxlen=2)  ## this structure is very convenient here:
          for _ in range(2):  ## it always holds exactly 2 characters, so appending the next one
            feed.append(sample(random_distribution()))  ## pushes the oldest one out
          sentence = characters(feed[0])[0] + characters(feed[1])[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input[0]: feed[0], sample_input[1]: feed[1]})
            feed.append(sample(prediction))  # the newest prediction replaces the oldest character
            sentence += characters(feed[1])[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input[0]: b[0], sample_input[1]: b[1]})
        valid_logprob = valid_logprob + logprob(predictions, b[2])  ## we now score the 3rd character, not the 2nd, because we feed bigrams
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))
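# For completeness: logprob(), sample(), random_distribution(), characters() and
# BatchGenerator come from the original Udacity 6_lstm.ipynb notebook and are not
# redefined in this gist. A sketch of the logprob helper this loop assumes:
import numpy as np
def logprob(predictions, labels):
  """Log-probability of the true labels in a batch of softmax predictions."""
  predictions[predictions < 1e-10] = 1e-10   # avoid log(0)
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]
# Perplexity, as printed above, is then np.exp(logprob(predictions, labels)).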