@FoConrad · Created April 16, 2018
Weight divergence between TensorFlow and MXNet when using the Adam optimizer from the same initial weights
import argparse

import numpy as np         # Version 1.13.1
import tensorflow as tf    # Version 1.4.1
import mxnet as mx         # Version 1.1.0
from mxnet import gluon, nd, autograd


# From openai baselines common/tf_util
def dense(x, size, name, weight_init=None, bias=True):
    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
    ret = tf.matmul(x, w)
    if bias:
        b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
        return ret + b
    else:
        return ret


class MLPPolicy_tf(object):
    def __init__(self, name, data):
        with tf.variable_scope(name):
            hidden = tf.nn.tanh(dense(data, 128, "polfc1",
                                      weight_init=tf.initializers.ones))
            self.logits = dense(hidden, 10, "polfinal", tf.initializers.ones)
            self.scope = tf.get_variable_scope().name

    # From openai baselines common/distributions
    def logp(self, x):
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        # the implementation does not allow second-order derivatives...
        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        return -tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
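

# Optional sanity check (not part of the original gist): the one-hot
# softmax_cross_entropy_with_logits path used in logp() above should agree
# numerically with the sparse variant ruled out by the comment; the sparse op is
# only avoided because it lacks second-order derivatives, not because it computes
# a different value. The helper name and call pattern are illustrative only.
def check_tf_logp_equivalence(policy, labels_ph, sess, feed_dict):
    sparse_logp = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=policy.logits, labels=labels_ph)
    dense_logp = policy.logp(labels_ph)
    sparse_val, dense_val = sess.run([sparse_logp, dense_logp], feed_dict=feed_dict)
    return np.allclose(sparse_val, dense_val)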


class MLPPolicy_mx(object):
    def __init__(self):
        self.policy = gluon.nn.Sequential()
        self.policy.add(gluon.nn.Dense(128, activation="tanh"))
        self.policy.add(gluon.nn.Dense(10))
        self.policy.collect_params().initialize(mx.initializer.Xavier())
        self.ce_loss = gluon.loss.SoftmaxCrossEntropyLoss()

    def forward(self, input_):
        logits = self.policy(input_)
        return logits

    def logp(self, logits, actions):
        # Tried many methods of calculating this with lower level operators
        return -self.ce_loss(logits, actions)

    def assign_parameters(self, other_params):
        for my_par, o_par in zip(self.policy.collect_params().values(),
                                 other_params.values()):
            my_par.set_data(o_par.data())
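

# Illustrative sketch (not part of the original gist): the per-sample log-probability
# returned by MLPPolicy_mx.logp() can also be written with lower-level NDArray ops,
# assuming a 1-D integer action vector and logits of shape (batch, num_classes).
# gluon's SoftmaxCrossEntropyLoss with sparse labels returns -log p(action), so its
# negation should match picking the taken action out of a log-softmax directly.
def mx_logp_lowlevel(logits, actions):
    # log-softmax over the class axis, then select the log-prob of the taken action
    return nd.pick(nd.log_softmax(logits, axis=-1), actions, axis=-1)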


def tf_objective(tf_pi, tf_oldpi, tf_labels, tf_atarg, clip_param, simple):
    if simple:
        # Simple objective, slow to diverge
        tf_pred = tf_pi.logp(tf_labels)
        pol_surr = -tf.reduce_mean(tf_pred * tf_atarg)
    else:
        # PPO policy objective
        new_logits = tf_pi.logp(tf_labels)
        old_logits = tf_oldpi.logp(tf_labels)
        ratio = tf.exp(new_logits - old_logits)
        surr1 = ratio * tf_atarg  # surrogate from conservative policy iteration
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * tf_atarg
        # PPO's pessimistic surrogate (L^CLIP)
        pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
    return pol_surr
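

# Reference note (not part of the original gist): the non-simple branch above is PPO's
# pessimistic clipped surrogate,
#   L^CLIP = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],  r_t = pi / pi_old.
# A tiny NumPy sketch of the clipping with made-up ratios and advantages, handy for
# eyeballing what either framework should produce on the same inputs:
def clip_surrogate_demo(clip_param=0.2):
    ratio = np.array([0.5, 1.0, 1.5])    # new/old policy probability ratios (made up)
    atarg = np.array([1.0, -1.0, 2.0])   # fake advantage estimates
    surr1 = ratio * atarg
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    return -np.mean(np.minimum(surr1, surr2))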


def update(data, labels, atarg, cliprange_now, pi, oldpi, policy_trainer,
           simple):
    advantages = nd.array(atarg)
    with autograd.record():
        old_logp = oldpi.logp(oldpi.forward(data), labels)
        new_logp = pi.logp(pi.forward(data), labels)
        if simple:
            # Simple objective, slow to diverge
            new_pred = pi.forward(data)
            actor_loss = -nd.mean(pi.logp(new_pred, labels) * advantages)
        else:
            # Action loss
            ratio = nd.exp(new_logp - old_logp)
            surr1 = ratio * advantages
            surr2 = nd.clip(ratio, 1.0 - cliprange_now, 1.0 + cliprange_now) * advantages
            actor_loss = -nd.mean(nd.minimum(surr1, surr2))
    # Compute gradients and updates
    actor_loss.backward()
    grads = []
    for name, param in iter(pi.policy.collect_params().items()):
        grads.append(param.grad().copy().asnumpy().transpose())
    policy_trainer.step(1)
    return actor_loss.asscalar(), grads
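

# Note (not part of the original gist): policy_trainer.step(1) is deliberate here.
# gluon's Trainer.step(batch_size) rescales gradients by 1/batch_size, and the loss
# above already averages over the batch with nd.mean, so step(1) keeps the update on
# the same scale as TensorFlow's reduce_mean-based objective.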


def learn(simple_objective, optimizer_choice):
    clip_param = 0.2
    optim_stepsize = 1e-3
    adam_epsilon = 1e-5

    mx.random.seed(0)
    np.random.seed(0)

    batch_size = 32
    mnist = mx.test_utils.get_mnist()
    mnist_length = len(mnist['train_data'])

    mx_pi = MLPPolicy_mx()
    mx_oldpi = MLPPolicy_mx()

    # List of optimizers for experiments. Can't seem to get Adagrad, RMSProp, etc.
    # to work, as MXNet and TensorFlow don't agree on hyper-parameters
    if optimizer_choice == 'adam':
        mx_trainer = gluon.Trainer(mx_pi.policy.collect_params(), 'adam',
                                   {'learning_rate': optim_stepsize, 'epsilon': adam_epsilon})
    elif optimizer_choice == 'sgd':
        mx_trainer = gluon.Trainer(mx_pi.policy.collect_params(), 'sgd',
                                   {'learning_rate': optim_stepsize})
    elif optimizer_choice == 'momentum':
        mx_trainer = gluon.Trainer(mx_pi.policy.collect_params(), 'sgd',
                                   {'learning_rate': optim_stepsize, 'momentum': 0.9})
    else:
        raise ValueError('Please choose a valid optimizer')

    # TF stuff
    sess = tf.Session(config=tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1))
    sess.__enter__()

    # Setup loss etc. (TF ops)
    # ----------------------------------------
    tf_data = tf.placeholder(name="data", dtype=tf.float32, shape=[None, 784])
    tf_labels = tf.placeholder(name="labels", dtype=tf.int32, shape=[None])
    tf_atarg = tf.placeholder(name="atarg", dtype=tf.float32, shape=[None])

    tf_pi = MLPPolicy_tf("pi", tf_data)
    tf_oldpi = MLPPolicy_tf("oldpi", tf_data)
    tf_loss_op = tf_objective(tf_pi, tf_oldpi, tf_labels, tf_atarg, clip_param,
                              simple=simple_objective)
    var_list = tf_pi.get_trainable_variables()

    # List of optimizers for experiments. Can't seem to get Adagrad, RMSProp, etc.
    # to work, as MXNet and TensorFlow don't agree on hyper-parameters
    if optimizer_choice == 'adam':
        tf_optimizer = tf.train.AdamOptimizer(learning_rate=optim_stepsize,
                                              epsilon=adam_epsilon)
    elif optimizer_choice == 'sgd':
        tf_optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=optim_stepsize)
    else:
        tf_optimizer = tf.train.MomentumOptimizer(learning_rate=optim_stepsize,
                                                  momentum=0.9)
    tf_opt_grads = tf_optimizer.compute_gradients(tf_loss_op, var_list)
    tf_learner = tf_optimizer.minimize(tf_loss_op, var_list=var_list)

    # Initialize TensorFlow variables; their initial values don't matter because
    # the trainable weights are overwritten with the MXNet initialization below
    new_variables = set(tf.global_variables())
    sess.run(tf.variables_initializer(new_variables))
    init_vars = tf.initialize_variables(tf_pi.get_trainable_variables())

    # Init mxnet parameters (the forward pass triggers the deferred initialization)
    mx_pi.forward(nd.array(mnist['train_data'][:128].reshape((128, 784))))

    # Assign mxnet initialization to tensorflow
    def mx_to_tf(mxND):
        return mxND.data().asnumpy().transpose()

    for m_val, t_val in zip(mx_pi.policy.collect_params().values(),
                            tf_pi.get_trainable_variables()):
        sess.run(t_val.assign(mx_to_tf(m_val)))
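
    # Sanity check (not part of the original gist): confirm both frameworks hold
    # identical weights before training starts, using the same transpose convention
    # as the copy loop above.
    for m_val, t_val in zip(mx_pi.policy.collect_params().values(),
                            tf_pi.get_trainable_variables()):
        assert np.allclose(mx_to_tf(m_val), t_val.eval(session=sess)), \
            'initial MXNet and TensorFlow weights differ'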

    update_every = [100, 100]   # [reporting period, countdown until next report]
    while True:
        train_iter = mx.io.NDArrayIter(mnist['train_data'],
                                       mnist['train_label'], batch_size, shuffle=True)
        for (pi_v, oldpi_v) in zip(tf_pi.get_variables(), tf_oldpi.get_variables()):
            sess.run(oldpi_v.assign(pi_v))
        mx_oldpi.assign_parameters(mx_pi.policy.collect_params())

        for iter_ in train_iter:
            data = iter_.data[0].reshape((iter_.data[0].shape[0], 784))
            labels = iter_.label[0]
            fake_atarg = (np.random.rand(batch_size) - 0.5)

            # Make a step with mxnet
            mx_loss, mx_gradients = update(data, labels, fake_atarg, clip_param,
                                           mx_pi, mx_oldpi, mx_trainer, simple=simple_objective)

            # Fill tensorflow placeholders
            _feed_dict = {tf_data: data.asnumpy(), tf_labels: labels.asnumpy(),
                          tf_atarg: fake_atarg}
            # Get the gradients that tensorflow calculated to compare with mxnet
            tf_loss = sess.run(tf_loss_op, feed_dict=_feed_dict)
            tf_grads = sess.run([grad[0] for grad in tf_opt_grads], feed_dict=_feed_dict)
            # Make a step with tensorflow
            sess.run(tf_learner, feed_dict=_feed_dict)

            # Keep track of gradients and weight values
            update_every[1] -= 1
            if update_every[1] == 0:
                update_every[1] = update_every[0]
                print('MX loss {}, TF loss {}, diff {}'.format(mx_loss, tf_loss,
                                                               abs(mx_loss - tf_loss)))
                for m_wgt, t_wgt, mx_grad, tf_grad in zip(
                        mx_pi.policy.collect_params().values(),
                        tf_pi.get_trainable_variables(), mx_gradients,
                        tf_grads):
                    grad_diff = mx_grad - tf_grad
                    weight_diff = mx_to_tf(m_wgt) - t_wgt.eval(session=sess)
                    print('{} with shape {} '.format(t_wgt.name, mx_grad.shape))
                    print("Abs sum diff grad", np.sum(np.abs(grad_diff)))
                    print("Abs sum diff weight", np.sum(np.abs(weight_diff)))
                    print()
                print('\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Experiment parameters
    parser.add_argument('--optimizer', default='adam', choices=['adam', 'sgd', 'momentum'],
                        help='which optimizer to use')
    parser.add_argument('--simple-loss', action='store_true',
                        help='use simple loss function instead of PPO-like one')
    args = parser.parse_args()

    learn(optimizer_choice=args.optimizer, simple_objective=args.simple_loss)
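
# Example invocations (filename is hypothetical; use whatever this gist is saved as):
#   python tf_mx_divergence.py                    # Adam + PPO-style clipped objective
#   python tf_mx_divergence.py --simple-loss      # Adam + simple policy-gradient loss
#   python tf_mx_divergence.py --optimizer sgd    # plain SGD for comparison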