import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

def simple_model(input):
    # This ensures that the model will always be instantiated the same, for comparison.
    hidden_initializer = tf.constant_initializer(np.random.uniform(-0.025, 0.025, size=[784,100]))
    hidden = tf.layers.dense(input, 100, kernel_initializer=hidden_initializer)
    out_initializer = tf.constant_initializer(np.random.uniform(-0.025, 0.025, size=[100,10]))
    return tf.layers.dense(tf.nn.relu(hidden), 10, kernel_initializer=out_initializer)

inp = tf.placeholder(tf.float32, [None,784])
targ = tf.placeholder(tf.float32, [None,10])

# Define our divisor, used to normalise gradients across pseudo_batches
divisor = tf.Variable(0, trainable=False)
div_fl = tf.to_float(divisor)
reset_divisor = divisor.assign(0)
inc_divisor = divisor.assign(divisor+1)

# Make our model, optimizer and gradients
out = simple_model(inp)
opt = tf.train.GradientDescentOptimizer(learning_rate=1e-2)
loss = tf.losses.mean_squared_error(out, targ)
t_vars = tf.trainable_variables()
# Compute gradients for a batch
grads, graph_vars = zip(*opt.compute_gradients(loss, t_vars))

# Accumulation ops and variables
# Create a copy of all trainable variables with `0` as initial values
accum_grads = [tf.Variable(tf.zeros_like(t_var.initialized_value()), trainable=False) for t_var in t_vars]
# Create an op to initialize all accum vars (and zero the divisor again)
with tf.control_dependencies([reset_divisor]):
    zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_grads]
# Create ops for accumulating the gradient (also adds one to the final divisor)
with tf.control_dependencies([inc_divisor]):
    accum_ops = [accum_grad.assign_add(grad) for (accum_grad, grad) in zip(accum_grads, grads)]
# Create op that updates the weights (also divides accumulated gradients by the number of steps)
normalised_accum_grads = [accum_grad/div_fl for accum_grad in accum_grads]
train_op = opt.apply_gradients(zip(normalised_accum_grads, graph_vars))
def graph_vars_equivalence():
    '''
    Simply ensures that the graph_vars returned by `opt.compute_gradients` is the full
    set of trainable variables
    '''
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    inp_, targ_ = mnist.train.next_batch(1)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        t_vars_ = sess.run(t_vars)
        graph_vars_ = sess.run(graph_vars, {inp: inp_, targ: targ_})
    for t, g in zip(t_vars_, graph_vars_):
        assert t.shape == g.shape
        # Exact equality only holds if these are the same underlying variables
        assert np.all(t == g), 'Graph vars is not the same as t_vars'
def initial_weights_same_after_reinit():
    '''
    Ensures that the weights are the same when we re-initialize the graph
    '''
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        t_vars_1 = sess.run(t_vars)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        t_vars_2 = sess.run(t_vars)
    for v1, v2 in zip(t_vars_1, t_vars_2):
        assert np.all(v1 == v2), 'Weights not initialized the same'
def same_seed_gives_same_examples():
    '''
    Ensures that multiple instantiations of the dataset return the same data
    '''
    mnist1 = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    for x in range(10):
        mnist1.train.next_batch(10)
    inp_1, targ_1 = mnist1.train.next_batch(1)
    mnist2 = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    for x in range(100):
        mnist2.train.next_batch(1)
    inp_2, targ_2 = mnist2.train.next_batch(1)
    assert np.all(inp_1 == inp_2), 'Batch size counts'
def direct_comp(batch_size):
    '''
    Directly compares the gradients of a standard forward pass with
    several elements in a single batch to the accumulated gradients obtained
    with several forward passes with individual batch elements.
    If the accumulation method is working, then the accumulated gradients
    at such a point should be approximately the same value as those calculated
    from a standard forward pass with all elements at once.
    '''
    tf.set_random_seed(147258)
    np.random.seed(123456)
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    inp_, targ_ = mnist.train.next_batch(batch_size)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, t in zip(inp_, targ_):
            sess.run(accum_ops, {inp: [i], targ: [t]})
        accum_grads_ = sess.run(normalised_accum_grads)
        standard_grads = sess.run(grads, {inp: inp_, targ: targ_})
    for (i, (acc, sta)) in enumerate(zip(accum_grads_, standard_grads)):
        diff = np.max(abs(acc - sta))
        assert diff < 1e-7, 'Accumulated gradients out by at most {}'.format(diff)
def do_train(actual_batch, pseudo_batch, iterations=1000):
    '''
    Performs some number of steps of training and does some evaluation.
    We expect that, provided actual_batch*pseudo_batch doesn't change,
    the final accuracy, final mean loss and final loss std deviation
    shouldn't change either.
    '''
    tf.set_random_seed(147258)
    np.random.seed(123456)
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    total_sum = 0
    total_sum_2 = 0
    losses = []
    n_correct = 0
    n_incorrect = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        counter = 0
        accumulated_grads = []
        # Train
        for x in range(iterations):
            # Apparently np.sum isn't compatible with native summing over multiple arrays,
            # so we always pull the same total batch size and split it as needed.
            inp_, targ_ = mnist.train.next_batch(actual_batch*pseudo_batch)
            total_sum += np.sum(inp_)
            # Split into pseudo_batch chunks, each shaped [actual_batch, 784]
            inp_ = np.split(inp_, np.arange(actual_batch, actual_batch*pseudo_batch, actual_batch))
            targ_ = np.split(targ_, np.arange(actual_batch, actual_batch*pseudo_batch, actual_batch))
            iteration_loss = 0
            for y in range(pseudo_batch):
                total_sum_2 += np.sum(inp_[y])
                _, loss_ = sess.run((accum_ops, loss), {inp: inp_[y], targ: targ_[y]})
                iteration_loss += loss_
            sess.run(train_op)
            sess.run(zero_ops)
            losses.append(iteration_loss/pseudo_batch)
        # Evaluate
        for x in range(10):
            inp_, targ_ = mnist.test.next_batch(128)
            pred = sess.run(out, {inp: inp_})
            comp = np.argmax(targ_, 1) == np.argmax(pred, 1)
            c = np.count_nonzero(comp)
            n_correct += c
            n_incorrect += 128-c
    total = n_correct + n_incorrect
    prop_correct = n_correct/total*100
    losses = np.array(losses)
    print('Accuracy: {:5.3f}%, Loss: mean: {:8.6f}, std: {:8.6f}'.format(prop_correct, np.mean(losses), np.std(losses)))
    print('Total sum (i.e. simplest hash): {}'.format(total_sum))
    print('Total sum 2 (different summing): {}'.format(total_sum_2))
# Initial tests
graph_vars_equivalence()
initial_weights_same_after_reinit()
same_seed_gives_same_examples()
direct_comp(1)
direct_comp(10)
direct_comp(64)
print('All direct comparisons passed')

num_steps = 50
do_train(64, 1, num_steps)
do_train(1, 64, num_steps)
# do_train(1, 1, num_steps*64)
Alright, so 1. and 2. should produce the same results if the gradients are equal, but 3. should produce different results because we are training with a true batch size of 1, meaning our weights get updated 64 times as often.
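For plain SGD this is just linearity of the averaged gradient. A minimal numpy sketch of the three update rules (the per-example gradients g and the learning rate here are made up for illustration, not values from the gist):

import numpy as np

np.random.seed(0)
lr = 1e-2
g = np.random.randn(64, 10)            # hypothetical per-example gradients at the current weights
w0 = np.zeros(10)

w1 = w0 - lr * g.mean(axis=0)          # 1. one real batch of 64: a single update
w2 = w0 - lr * (g.sum(axis=0) / 64.)   # 2. 64 accumulated pseudo-batches: a single, identical update
w3 = w0.copy()
for gi in g:                           # 3. 64 real batches of 1: 64 separate updates
    w3 = w3 - lr * gi                  #    (in real training each gi would also be recomputed
                                       #     at the updated weights, unlike here)

print(np.allclose(w1, w2))   # True: 1. and 2. coincide for SGD
print(np.allclose(w1, w3))   # False: the step is 64x larger even with fixed gradients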
At the moment I am on Windows with no GPU, so I can't use Sonnet, but I re-ran this script with my own custom model, like this:
def simple_model(inputs):
    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
        net = tf.reshape(inputs, (-1, 28 * 28))
        net = slim.fully_connected(net, 512)
        net = slim.dropout(net, keep_prob=0.8)
        net = slim.fully_connected(net, 512)
        net = slim.dropout(net, keep_prob=0.8)
        net = slim.fully_connected(net, 10, activation_fn=tf.nn.softmax)
        net = tf.reshape(net, (-1, 10))
    return net
a) The gradients do not match! I have to set the allowed difference to 1e-2 in order not to trigger the assertion error.
b) I used 400 steps (because I am impatient) and a batch size of 64:
num_steps = 400
do_train(64, 1, num_steps)
do_train(1, 64, num_steps)
do_train(1, 1, num_steps * 64)
I receive:
Accuracy: 81.094%
0.0665526 0.0172394
Accuracy: 82.266%
0.0662412 0.0274664
Accuracy: 73.359%
0.0516458 0.0466239
So I assume a smaller batch size is bad for this particular configuration. Furthermore, 3. is completely different (as predicted). But 1. and 2. seem similar. So maybe there is some random behaviour that we are not accounting for, but the overall idea is correct. What do you think?
// Edit: In my first post the numbers were low; I fixed that by setting learning_rate=0.5 and doing 400 iterations.
Sorry for not responding to this for over a month. I basically gave up on this line of inquiry for a while, thinking it was something deep in TensorFlow causing it.
I've just come back to it, and realised a few mistakes in the original version that were causing it not to work. Or at least, the measurement of it wasn't working, because:
- I wasn't ensuring that it was getting the exact same data
- I wasn't ensuring that the models were instantiated with the same weights
- I wasn't adding up the losses correctly, resulting in a larger apparent std of the loss for pseudo-batches than was actually the case (see the snippet below)
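For that third point in particular, the fix is just to average the per-element losses over the pseudo-batch before recording them, which is what do_train in the edited gist above now does, so the logged loss is comparable to a single large-batch loss:

iteration_loss = 0
for y in range(pseudo_batch):
    _, loss_ = sess.run((accum_ops, loss), {inp: inp_[y], targ: targ_[y]})
    iteration_loss += loss_
sess.run(train_op)
sess.run(zero_ops)
# Record the mean loss over the pseudo-batch, not the sum
losses.append(iteration_loss / pseudo_batch)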
After fixing these it all ends up exactly how we expect it to (see the edited gist)! Using your num_steps=50 and changing the model slightly, I get:
Accuracy: 10.156%, Loss: mean: 0.098617, std: 0.001016
Accuracy: 10.156%, Loss: mean: 0.098617, std: 0.001016
I only had it at num_steps=1000 because I wanted to see if it converged to the same thing, statistically. But now that we can be sure that it is the same, numerically, we only need to do a few steps.
By the way, the gradients not matching within 1e-2 means something drastic, I should think. That's not just floating-point arithmetic error. I've not used slim before, but I'd be willing to bet that it's the dropout layers causing issues (since I don't think they will necessarily drop out the same units in each case).
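One quick way to test that theory would be to rebuild your model with dropout disabled and re-run direct_comp: if the gradients then agree to roughly the original 1e-7 tolerance, the random dropout masks were the culprit. A sketch only (the no-dropout variant and its name are my addition, using slim's is_training flag to make dropout a pass-through):

import tensorflow as tf
import tensorflow.contrib.slim as slim

def simple_model_no_dropout(inputs):
    # Same architecture as your model above, but with dropout disabled so that
    # every forward pass is deterministic and the gradients are comparable.
    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
        net = tf.reshape(inputs, (-1, 28 * 28))
        net = slim.fully_connected(net, 512)
        net = slim.dropout(net, keep_prob=0.8, is_training=False)  # pass-through
        net = slim.fully_connected(net, 512)
        net = slim.dropout(net, keep_prob=0.8, is_training=False)  # pass-through
        net = slim.fully_connected(net, 10, activation_fn=tf.nn.softmax)
    return tf.reshape(net, (-1, 10))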
As you might've seen by running this, a pseudo-batch of 2 batches of 32 examples takes a little less than 2x the time of a single batch of 64 examples. A friend showed me this cool new technique for reducing memory recently, so if you have the technical know-how, that could be a better way to deal with memory constraints.
I also made a minimal example of it, without all the testing code.
But, you should be aware, it's not going to be quite the same if you use Momentum, Adam, Adagrad, Adadelta or really any other Optimizer. It should be mostly equivalent, but the numbers won't quite match, and the larger the pseudo_batch the more difference you would expect.
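My reading of why (an assumption on my part, not something the gist verifies): those optimizers keep per-variable slot state (momentum, or Adam's moment estimates) that is a nonlinear function of the gradients, so the tiny floating-point differences between the accumulated-and-averaged gradient and the true big-batch gradient get rescaled by that state and compound over steps; more accumulation steps means more rounding, hence more drift. Mechanically, only the optimizer line in the gist would change, e.g.:

# Hypothetical swap; everything else in the gist stays the same. Adam's slot
# variables (m and v per weight) are still updated once per apply_gradients
# call, i.e. once per pseudo-batch, just as for a real batch:
#   m <- beta1*m + (1 - beta1)*g
#   v <- beta2*v + (1 - beta2)*g**2
#   w <- w - lr * m_hat / (sqrt(v_hat) + eps)
opt = tf.train.AdamOptimizer(learning_rate=1e-3)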
EDIT: Outdated information. See later comments.
The results of the different do_train runs suggest that the gradient accumulation doesn't actually work.