Accumulating gradients to reduce memory requirement per forward pass (using MNIST)
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

def simple_model(input):
    # This ensures that the model will always be instantiated the same, for comparison.
    hidden_initializer = tf.constant_initializer(np.random.uniform(-0.025, 0.025, size=[784,100]))
    hidden = tf.layers.dense(input, 100, kernel_initializer=hidden_initializer)
    out_initializer = tf.constant_initializer(np.random.uniform(-0.025, 0.025, size=[100,10]))
    return tf.layers.dense(tf.nn.relu(hidden), 10, kernel_initializer=out_initializer)

inp = tf.placeholder(tf.float32, [None,784])
targ = tf.placeholder(tf.float32, [None,10])
# Define our divisor, used to normalise gradients across pseudo_batches
divisor = tf.Variable(0, trainable=False)
div_fl = tf.to_float(divisor)
reset_divisor = divisor.assign(0)
inc_divisor = divisor.assign(divisor+1)
# Make our model and optimizer and gradients
out = simple_model(inp)
opt = tf.train.GradientDescentOptimizer(learning_rate=1e-2)
loss = tf.losses.mean_squared_error(out, targ)
t_vars = tf.trainable_variables()
# compute gradients for a batch
grads, graph_vars = zip(*opt.compute_gradients(loss, t_vars))
# Accumulation ops and variables
# create a copy of all trainable variables with `0` as initial values
accum_grads = [tf.Variable(tf.zeros_like(t_var.initialized_value()), trainable=False) for t_var in t_vars]
# create an op to zero all accumulator vars (and reset the divisor)
with tf.control_dependencies([reset_divisor]):
    zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_grads]
# Create ops for accumulating the gradient (also adds one to the final divisor)
with tf.control_dependencies([inc_divisor]):
    accum_ops = [accum_grad.assign_add(grad) for (accum_grad, grad) in zip(accum_grads, grads)]
# Create op that updates the weights (also divides accumulated gradients by the number of steps)
normalised_accum_grads = [accum_grad/div_fl for (accum_grad) in accum_grads]
train_op = opt.apply_gradients(zip(normalised_accum_grads, graph_vars))

def graph_vars_equivalence():
    '''
    Simply ensures that the graph_vars returned by `opt.compute_gradients` is the full
    set of trainable variables
    '''
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    inp_, targ_ = mnist.train.next_batch(1)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        t_vars_ = sess.run(t_vars)
        graph_vars_ = sess.run(graph_vars, {inp: inp_, targ: targ_})
        for t, g in zip(t_vars_, graph_vars_):
            assert t.shape == g.shape
            # Must point to the same memory to pass
            assert np.all(t == g), 'Graph vars is not the same as t_vars'

def initial_weights_same_after_reinit():
    '''
    Ensures that the weights are the same when we re-initialize the graph
    '''
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        t_vars_1 = sess.run(t_vars)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        t_vars_2 = sess.run(t_vars)
    for v1, v2 in zip(t_vars_1, t_vars_2):
        assert np.all(v1 == v2), 'Weights not initialized the same'

def same_seed_gives_same_examples():
    '''
    Ensures that multiple instantiations of the dataset return the same data
    '''
    mnist1 = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    for x in range(10):
        mnist1.train.next_batch(10)
    inp_1, targ_1 = mnist1.train.next_batch(1)
    mnist2 = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    for x in range(100):
        mnist2.train.next_batch(1)
    inp_2, targ_2 = mnist2.train.next_batch(1)
    assert np.all(inp_1 == inp_2), 'Batch size counts'

def direct_comp(batch_size):
    '''
    Directly compares the gradients of a standard forward pass with
    several elements in a single batch to the accumulated gradients obtained
    with several forward passes with individual batch elements.
    If the accumulation method is working, then the accumulated gradients
    at such a point should be approximately the same value as those calculated
    from a standard forward pass with all elements at once.
    '''
    tf.set_random_seed(147258)
    np.random.seed(123456)
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    inp_, targ_ = mnist.train.next_batch(batch_size)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, t in zip(inp_, targ_):
            sess.run(accum_ops, {inp: [i], targ: [t]})
        accum_grads_ = sess.run(normalised_accum_grads)
        standard_grads = sess.run(grads, {inp: inp_, targ: targ_})
        for acc, sta in zip(accum_grads_, standard_grads):
            diff = np.max(abs(acc - sta))
            assert diff < 1e-7, 'Accumulated gradients out by at most {}'.format(diff)

def do_train(actual_batch, pseudo_batch, iterations=1000):
    '''
    Performs some number of steps of training and does some evaluation.
    We expect that, provided actual_batch*pseudo_batch doesn't change,
    neither should the final accuracy, final mean loss or final loss std
    deviation.
    '''
    tf.set_random_seed(147258)
    np.random.seed(123456)
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, seed=764847)
    total_sum = 0
    total_sum_2 = 0
    losses = []
    n_correct = 0
    n_incorrect = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Train
        for x in range(iterations):
            # Apparently np.sum isn't compatible with native summing over multiple arrays,
            # so we always pull the same batch size and split it as needed.
            inp_, targ_ = mnist.train.next_batch(actual_batch*pseudo_batch)
            total_sum += np.sum(inp_)
            # Split into a list of pseudo_batch chunks, each shaped [actual_batch, 784]
            inp_ = np.split(inp_, np.arange(actual_batch, actual_batch*pseudo_batch, actual_batch))
            targ_ = np.split(targ_, np.arange(actual_batch, actual_batch*pseudo_batch, actual_batch))
            iteration_loss = 0
            for y in range(pseudo_batch):
                total_sum_2 += np.sum(inp_[y])
                _, loss_ = sess.run((accum_ops, loss), {inp: inp_[y], targ: targ_[y]})
                iteration_loss += loss_
            sess.run(train_op)
            sess.run(zero_ops)
            losses.append(iteration_loss/pseudo_batch)
        # Evaluate
        for x in range(10):
            inp_, targ_ = mnist.test.next_batch(128)
            pred = sess.run(out, {inp: inp_})
            comp = np.argmax(targ_, 1) == np.argmax(pred, 1)
            c = np.count_nonzero(comp)
            n_correct += c
            n_incorrect += 128-c
    total = n_correct + n_incorrect
    prop_correct = n_correct/total*100
    losses = np.array(losses)
    print('Accuracy: {:5.3f}%, Loss: mean: {:8.6f}, std: {:8.6f}'.format(prop_correct, np.mean(losses), np.std(losses)))
    print('Total sum (i.e. simplest hash): {}'.format(total_sum))
    print('Total sum 2 (different summing): {}'.format(total_sum_2))

# Initial tests
graph_vars_equivalence()
initial_weights_same_after_reinit()
same_seed_gives_same_examples()
direct_comp(1)
direct_comp(10)
direct_comp(64)
print('All direct comparisons passed')
num_steps = 50
do_train(64, 1, num_steps)
do_train(1, 64, num_steps)
# do_train(1, 1, num_steps*64)
@Multihuntr (Author) commented Nov 8, 2017

EDIT: Outdated information. See later comments.

The results of the different do_train calls suggest that the gradient accumulation doesn't actually work.

1. Accuracy: 86.016%, Loss: mean: 0.0477589, std: 0.0101381
2. Accuracy: 83.281%, Loss: mean: 0.0473665, std: 0.0315746
3. Accuracy: 83.359%, Loss: mean: 0.0467697, std: 0.0332775

@Sylvus commented Dec 11, 2017

Alright, so 1. and 2. should produce the same results if the gradients are equal, but 3. should produce different results because we are training with a batch size of 1, meaning our weights get updated 64 times as often.
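
A toy sketch (not part of the gist) of why 3. drifts: one update with the averaged gradient is not the same as 64 sequential batch-size-1 updates, because the later updates see weights that have already moved.

import numpy as np

# Toy loss w^2 with 64 identical "examples"; the per-example gradient is 2*w.
lr, w0 = 1e-2, 1.0

# Cases 1 and 2: accumulate 64 per-example gradients, average, apply once.
w_accum = w0 - lr * np.mean([2 * w0 for _ in range(64)])   # -> 0.98

# Case 3: 64 separate batch-size-1 updates; the gradient is re-evaluated at
# every intermediate point, so the end result differs.
w_seq = w0
for _ in range(64):
    w_seq -= lr * 2 * w_seq                                # -> ~0.27

print(w_accum, w_seq)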
At the moment I am on Windows with no GPU, so I can't use Sonnet, but I reran this script with my own custom model like this:

import tensorflow.contrib.slim as slim

def simple_model(inputs):
    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.truncated_normal_initializer(
                            0.0, 0.01), ):
        net = tf.reshape(inputs, (-1, 28 * 28))
        net = slim.fully_connected(net, 512)
        net = slim.dropout(net, keep_prob=0.8)
        net = slim.fully_connected(net, 512)
        net = slim.dropout(net, keep_prob=0.8)
        net = slim.fully_connected(net, 10, activation_fn=tf.nn.softmax)
        net = tf.reshape(net, (-1, 10))
    return net

a) The gradients do not match! I have to set the allowed difference to 1e-2 in order to not cause an assertion error.
b) I used 400 steps (because I am impatient) and batch size 64:

num_steps = 400
do_train(64, 1, num_steps)
do_train(1, 64, num_steps)
do_train(1, 1, num_steps * 64)

I receive:

Accuracy: 81.094%, Loss: mean: 0.0665526, std: 0.0172394
Accuracy: 82.266%, Loss: mean: 0.0662412, std: 0.0274664
Accuracy: 73.359%, Loss: mean: 0.0516458, std: 0.0466239

So I assume a smaller batch size is bad for this particular configuration. Furthermore, 3. is completely different (as predicted). But 1. and 2. seem similar. So maybe there is some random behaviour that we are not accounting for, but the overall idea is correct. What do you think?
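
One way to narrow down where the 1e-2 mismatch in (a) comes from: a small helper (a sketch of mine, `report_grad_diffs` is not part of the gist) that reports per-variable differences instead of asserting. If every layer is off, that points at something global like the dropout masks rather than a single bad gradient.

def report_grad_diffs(var_list, accum, standard):
    # Print the largest absolute difference per variable instead of asserting,
    # so it's visible whether the mismatch is global or confined to one layer.
    for t_var, acc, sta in zip(var_list, accum, standard):
        diff = np.max(np.abs(acc - sta))
        print('{:<40} max |accum - standard| = {:.3e}'.format(t_var.name, diff))

# e.g. inside direct_comp, right after standard_grads is fetched:
# report_grad_diffs(t_vars, accum_grads_, standard_grads)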

// Edit: In my first post the numbers were low; I fixed it by setting learning_rate=0.5 and doing 400 iterations.

@Multihuntr (Author) commented Jan 23, 2018

Sorry for not responding to this for over a month. I basically gave up on this line of inquiry for a while, thinking it was something deep in Tensorflow causing it.

I've just come back to it and realised a few mistakes in the original version that were causing it to not work. Or at least, the measurement wasn't working, because:

  • I wasn't ensuring that it was getting the exact same data
  • I wasn't ensuring that the models were instantiated with the same weights
  • I wasn't adding up the losses correctly, resulting in a larger reported loss std for pseudo-batches than was actually the case

After fixing these it all ends up exactly how we expect it to (see edited gist)! Using your num_steps=50 and changing the model slightly I get:

Accuracy:  10.156%, Loss: mean: 0.098617,  std: 0.001016
Accuracy:  10.156%, Loss: mean: 0.098617,  std: 0.001016

I only had it at num_steps=1000 because I wanted to see if it converged to the same thing, statistically. But now that we can be sure that it is the same, numerically, we only need to do a few steps.

By the way, the gradients not matching within 1e-2 means something drastic, I should think. That's not just floating-point arithmetic error. I've not used slim before, but I'd be willing to bet that it's the dropout layers causing issues (since I don't think they will necessarily drop out the same units in each case).
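
If it is the dropout, one way to check (a sketch only, building on the slim model above; the `is_training` placeholder is new here): slim.dropout takes an `is_training` argument, so switching it off for the comparison should make direct_comp's gradients match again.

import tensorflow as tf
import tensorflow.contrib.slim as slim

is_training = tf.placeholder_with_default(False, shape=[])

def simple_model(inputs):
    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
        net = tf.reshape(inputs, (-1, 28 * 28))
        net = slim.fully_connected(net, 512)
        # With is_training=False the dropout layers become the identity, so the
        # accumulated and single-batch passes run through exactly the same network.
        net = slim.dropout(net, keep_prob=0.8, is_training=is_training)
        net = slim.fully_connected(net, 512)
        net = slim.dropout(net, keep_prob=0.8, is_training=is_training)
        net = slim.fully_connected(net, 10, activation_fn=tf.nn.softmax)
        net = tf.reshape(net, (-1, 10))
    return net

For training you would feed {is_training: True}; for the gradient comparison, leave it at the default False.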

As you might've seen by running this, accumulating 2 pseudo-batches of 32 examples takes a little less than 2x the time of a single batch of 64 examples. A friend recently showed me a cool new technique for reducing memory, so if you have the technical know-how, that could be a better way to deal with memory constraints.

@Multihuntr (Author) commented Jan 23, 2018

I also made a minimal example of it, without all the testing code.

But you should be aware that it's not going to be quite the same if you use Momentum, Adam, Adagrad, Adadelta, or really any other Optimizer. It should be mostly equivalent, but the numbers won't quite match, and the larger the pseudo_batch, the more difference you would expect.
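
To make that concrete, the swap against the gist above looks roughly like this (a sketch; the 1e-3 learning rate is only a placeholder). The accumulation ops don't change, but Adam also keeps per-variable slot variables (moment estimates) that are only updated once per apply_gradients call, i.e. once per pseudo-batch.

# Sketch: everything else in the gist stays the same, only the optimizer changes.
opt = tf.train.AdamOptimizer(learning_rate=1e-3)

grads, graph_vars = zip(*opt.compute_gradients(loss, t_vars))
# ... accumulate into accum_grads and normalise exactly as before, then:
train_op = opt.apply_gradients(zip(normalised_accum_grads, graph_vars))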
