Weight divergence between TF and MX when using Adam optimizer from same initial weights
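To reproduce, save the gist as a script and run it with the optimizer under test (the filename below is hypothetical; the script prints loss, gradient, and weight differences every 100 steps):

python tf_mx_divergence.py                          # defaults to --optimizer adam
python tf_mx_divergence.py --optimizer sgd --simple-loss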
import argparse
import numpy as np  # Version 1.13.1
import tensorflow as tf  # Version 1.4.1
import mxnet as mx  # Version 1.1.0
from mxnet import gluon, nd, autograd
# From openai baselines common/tf_util
def dense(x, size, name, weight_init=None, bias=True):
    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
    ret = tf.matmul(x, w)
    if bias:
        b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
        return ret + b
    else:
        return ret
class MLPPolicy_tf(object):
    def __init__(self, name, data):
        with tf.variable_scope(name):
            hidden = tf.nn.tanh(dense(data, 128, "polfc1",
                                      weight_init=tf.initializers.ones))
            self.logits = dense(hidden, 10, "polfinal", tf.initializers.ones)
            self.scope = tf.get_variable_scope().name

    # From openai baselines common/distributions
    def logp(self, x):
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        # the implementation does not allow second-order derivatives...
        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        return -tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
class MLPPolicy_mx(object):
    def __init__(self):
        self.policy = gluon.nn.Sequential()
        self.policy.add(gluon.nn.Dense(128, activation="tanh"))
        self.policy.add(gluon.nn.Dense(10))
        self.policy.collect_params().initialize(mx.initializer.Xavier())
        self.ce_loss = gluon.loss.SoftmaxCrossEntropyLoss()

    def forward(self, input_):
        logits = self.policy(input_)
        return logits

    def logp(self, logits, actions):
        # Tried many methods of calculating this with lower level operators.
        # SoftmaxCrossEntropyLoss returns the per-example negative
        # log-likelihood, so its negation is log p(action) per example.
        return -self.ce_loss(logits, actions)

    def assign_parameters(self, other_params):
        for my_par, o_par in zip(self.policy.collect_params().values(),
                                 other_params.values()):
            my_par.set_data(o_par.data())
def tf_objective(tf_pi, tf_oldpi, tf_labels, tf_atarg, clip_param, simple):
    if simple:
        # Simple objective, slow to diverge
        tf_pred = tf_pi.logp(tf_labels)
        pol_surr = -tf.reduce_mean(tf_pred * tf_atarg)
    else:
        # PPO policy objective
        new_logp = tf_pi.logp(tf_labels)
        old_logp = tf_oldpi.logp(tf_labels)
        ratio = tf.exp(new_logp - old_logp)
        surr1 = ratio * tf_atarg  # surrogate from conservative policy iteration
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * tf_atarg
        # PPO's pessimistic surrogate (L^CLIP)
        pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
    return pol_surr
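# The pessimistic surrogate above is PPO's clipped objective
#   L^CLIP = E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)],
# with r = exp(logp_new - logp_old) and A the advantage estimate.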
def update(data, labels, atarg, cliprange_now, pi, oldpi, policy_trainer,
           simple):
    advantages = nd.array(atarg)
    with autograd.record():
        old_logp = oldpi.logp(oldpi.forward(data), labels)
        new_logp = pi.logp(pi.forward(data), labels)
        if simple:
            # Simple objective, slow to diverge
            new_pred = pi.forward(data)
            actor_loss = -nd.mean(pi.logp(new_pred, labels) * advantages)
        else:
            # Action loss
            ratio = nd.exp(new_logp - old_logp)
            surr1 = ratio * advantages
            surr2 = nd.clip(ratio, 1.0 - cliprange_now, 1.0 + cliprange_now) * advantages
            actor_loss = -nd.mean(nd.minimum(surr1, surr2))
    # Compute gradients and updates
    actor_loss.backward()
    grads = []
    for name, param in pi.policy.collect_params().items():
        grads.append(param.grad().copy().asnumpy().transpose())
    # step(1) applies the raw gradients without rescaling by batch size,
    # since the loss is already a mean over the batch
    policy_trainer.step(1)
    return actor_loss.asscalar(), grads
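# Note on the transpose above: Gluon's Dense stores its weight matrix as
# (out_units, in_units), while the `dense` helper creates TF weights with
# shape (in_units, out_units), so MXNet gradients and weights are transposed
# before being compared against TensorFlow.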
def learn(simple_objective, optimizer_choice):
    clip_param = 0.2
    optim_stepsize = 1e-3
    adam_epsilon = 1e-5
    mx.random.seed(0)
    np.random.seed(0)
    batch_size = 32
    mnist = mx.test_utils.get_mnist()
    mnist_length = len(mnist['train_data'])
    mx_pi = MLPPolicy_mx()
    mx_oldpi = MLPPolicy_mx()
    # List of optimizers for experiments. Can't seem to get Adagrad, RMSProp,
    # etc. to work, as MXNet and TensorFlow don't agree on hyper-parameters
    if optimizer_choice == 'adam':
        mx_trainer = gluon.Trainer(mx_pi.policy.collect_params(), 'adam',
                                   {'learning_rate': optim_stepsize, 'epsilon': adam_epsilon})
    elif optimizer_choice == 'sgd':
        mx_trainer = gluon.Trainer(mx_pi.policy.collect_params(), 'sgd',
                                   {'learning_rate': optim_stepsize})
    elif optimizer_choice == 'momentum':
        mx_trainer = gluon.Trainer(mx_pi.policy.collect_params(), 'sgd',
                                   {'learning_rate': optim_stepsize, 'momentum': 0.9})
    else:
        raise ValueError('Please choose a valid optimizer')
    # TF stuff
    sess = tf.Session(config=tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1))
    sess.__enter__()
    # Set up loss etc. (TF ops)
    # ----------------------------------------
    tf_data = tf.placeholder(name="data", dtype=tf.float32, shape=[None, 784])
    tf_labels = tf.placeholder(name="labels", dtype=tf.int32, shape=[None])
    tf_atarg = tf.placeholder(name="atarg", dtype=tf.float32, shape=[None])
    tf_pi = MLPPolicy_tf("pi", tf_data)
    tf_oldpi = MLPPolicy_tf("oldpi", tf_data)
    tf_loss_op = tf_objective(tf_pi, tf_oldpi, tf_labels, tf_atarg, clip_param,
                              simple=simple_objective)
    var_list = tf_pi.get_trainable_variables()
    # Same optimizer menu as for MXNet above; Adagrad, RMSProp, etc. are
    # left out because the two frameworks don't agree on hyper-parameters
    if optimizer_choice == 'adam':
        tf_optimizer = tf.train.AdamOptimizer(learning_rate=optim_stepsize,
                                              epsilon=adam_epsilon)
    elif optimizer_choice == 'sgd':
        tf_optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=optim_stepsize)
    else:
        tf_optimizer = tf.train.MomentumOptimizer(learning_rate=optim_stepsize,
                                                  momentum=0.9)
    tf_opt_grads = tf_optimizer.compute_gradients(tf_loss_op, var_list)
    tf_learner = tf_optimizer.minimize(tf_loss_op, var_list=var_list)
    # Initialize TensorFlow variables. We don't care about the initial
    # weights (they are overwritten below), but they must exist to assign to
    new_variables = set(tf.global_variables())
    sess.run(tf.variables_initializer(new_variables))
    init_vars = tf.initialize_variables(tf_pi.get_trainable_variables())
    # Init mxnet parameters (deferred initialization needs one forward pass)
    mx_pi.forward(nd.array(mnist['train_data'][:128].reshape((128, 784))))

    # Assign mxnet initialization to tensorflow
    def mx_to_tf(mxND):
        return mxND.data().asnumpy().transpose()

    for m_val, t_val in zip(mx_pi.policy.collect_params().values(),
                            tf_pi.get_trainable_variables()):
        sess.run(t_val.assign(mx_to_tf(m_val)))
    update_every = [100, 100]  # [print period, countdown until next print]
    while True:
        train_iter = mx.io.NDArrayIter(mnist['train_data'],
                                       mnist['train_label'], batch_size, shuffle=True)
        for (pi_v, oldpi_v) in zip(tf_pi.get_variables(), tf_oldpi.get_variables()):
            sess.run(oldpi_v.assign(pi_v))
        mx_oldpi.assign_parameters(mx_pi.policy.collect_params())
        for iter_ in train_iter:
            data = iter_.data[0].reshape((iter_.data[0].shape[0], 784))
            labels = iter_.label[0]
            fake_atarg = (np.random.rand(batch_size) - 0.5)
            # Make a step with mxnet
            mx_loss, mx_gradients = update(data, labels, fake_atarg, clip_param,
                                           mx_pi, mx_oldpi, mx_trainer,
                                           simple=simple_objective)
            # Fill tensorflow placeholders
            _feed_dict = {tf_data: data.asnumpy(), tf_labels: labels.asnumpy(),
                          tf_atarg: fake_atarg}
            # Get the gradients that tensorflow calculated to compare with mxnet
            tf_loss = sess.run(tf_loss_op, feed_dict=_feed_dict)
            tf_grads = sess.run([grad[0] for grad in tf_opt_grads], feed_dict=_feed_dict)
            # Make a step with tensorflow
            sess.run(tf_learner, feed_dict=_feed_dict)
            # Keep track of gradients and weight values
            update_every[1] -= 1
            if update_every[1] == 0:
                update_every[1] = update_every[0]
                print('MX loss {}, TF loss {}, diff {}'.format(mx_loss, tf_loss,
                                                               abs(mx_loss - tf_loss)))
                for m_wgt, t_wgt, mx_grad, tf_grad in zip(
                        mx_pi.policy.collect_params().values(),
                        tf_pi.get_trainable_variables(), mx_gradients,
                        tf_grads):
                    grad_diff = mx_grad - tf_grad
                    weight_diff = mx_to_tf(m_wgt) - t_wgt.eval(session=sess)
                    print('{} with shape {}'.format(t_wgt.name, mx_grad.shape))
                    print("Abs sum diff grad", np.sum(np.abs(grad_diff)))
                    print("Abs sum diff weight", np.sum(np.abs(weight_diff)))
                    print()
                print('\n')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Experiment parameters
    parser.add_argument('--optimizer', default='adam',
                        choices=['adam', 'sgd', 'momentum'],
                        help='which optimizer to use')
    parser.add_argument('--simple-loss', action='store_true',
                        help='use simple loss function instead of PPO-like one')
    args = parser.parse_args()
    learn(optimizer_choice=args.optimizer, simple_objective=args.simple_loss)
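For a framework-independent reference, below is a minimal NumPy sketch of the two pieces the script exercises: the clipped PPO surrogate and a single textbook Adam step (Kingma & Ba, 2015). This is an illustration under textbook assumptions, not the implementation of either framework; small deviations from this form (for example, exactly where epsilon enters the denominator) are the kind of detail that can make two Adam implementations drift apart even from identical weights and gradients.

import numpy as np

def ppo_clip_loss(new_logp, old_logp, atarg, clip_param=0.2):
    # Negated pessimistic surrogate, as minimized by both frameworks above
    ratio = np.exp(new_logp - old_logp)  # r = pi_new / pi_old for taken actions
    surr1 = ratio * atarg
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    return -np.mean(np.minimum(surr1, surr2))

def adam_step(w, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-5):
    # One textbook Adam update; t is the 1-based step count
    m = beta1 * m + (1.0 - beta1) * g
    v = beta2 * v + (1.0 - beta2) * g * g
    m_hat = m / (1.0 - beta1 ** t)  # bias-corrected first moment
    v_hat = v / (1.0 - beta2 ** t)  # bias-corrected second moment
    w = w - lr * m_hat / (np.sqrt(v_hat) + eps)
    return w, m, v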