Gist by @lgeiger · Created May 7, 2018 10:55
Training GANs with Optimism using TensorFlow (https://github.com/vsyrgkanis/optimistic_GAN_training/)
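
This gist contains an AdamirrorOptimizer class implementing Optimistic Adam ("Adamirror"), plus a test module modeled on TensorFlow's Adam tests that checks it against a NumPy reference. Following Daskalakis et al., "Training GANs with Optimism" (the repository linked above), each step adds back the previous Adam step and then subtracts twice the new one, so that roughly

  theta_t = theta_{t-1} - 2 * eta_t * m_t / (sqrt(v_t) + eps) + eta_{t-1} * m_{t-1} / (sqrt(v_{t-1}) + eps)

where eta_t is the bias-corrected learning rate and m_t, v_t are the usual Adam moment estimates; the implementation below reuses the current eta_t when adding back the previous step.
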
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.eager import context
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.framework import ops
from tensorflow.python.training import optimizer


class AdamirrorOptimizer(optimizer.Optimizer):

  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
               use_locking=False, name="Adamirror"):
    super(AdamirrorOptimizer, self).__init__(use_locking, name)
    self._lr = learning_rate
    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon
    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._beta1_t = None
    self._beta2_t = None
    self._epsilon_t = None

  def _get_beta_accumulators(self):
    if context.executing_eagerly():
      graph = None
    else:
      graph = ops.get_default_graph()
    return (self._get_non_slot_variable("beta1_power", graph=graph),
            self._get_non_slot_variable("beta2_power", graph=graph))

  def _create_slots(self, var_list):
    # Create the beta1 and beta2 accumulators on the same device as the first
    # variable. Sort the var_list to make sure this device is consistent across
    # workers (these need to go on the same PS, otherwise some updates are
    # silently ignored).
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(initial_value=self._beta1,
                                   name="beta1_power",
                                   colocate_with=first_var)
    self._create_non_slot_variable(initial_value=self._beta2,
                                   name="beta2_power",
                                   colocate_with=first_var)
    # Create slots for the first and second moments.
    for v in var_list:
      self._zeros_slot(v, "m", self._name)
      self._zeros_slot(v, "v", self._name)

  def _prepare(self):
    self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate")
    self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1")
    self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2")
    self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon")
  def _apply_dense(self, grad, var):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)

    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")

    # Optimistic update, part 1: add back the previous Adam step, computed from
    # the stale moment estimates m and v.
    old_update = lr * m / (math_ops.sqrt(v) + epsilon_t)
    var_update = state_ops.assign_add(var, old_update, use_locking=self._use_locking)
    with ops.control_dependencies([var_update]):
      # Part 2: refresh the moment estimates and subtract twice the new step.
      m_t = state_ops.assign(m,
                             m * beta1_t + (grad * (1 - beta1_t)),
                             use_locking=self._use_locking)
      v_t = state_ops.assign(v,
                             v * beta2_t + ((grad * grad) * (1 - beta2_t)),
                             use_locking=self._use_locking)
      var_update = state_ops.assign_sub(var,
                                        2 * lr * m_t / (math_ops.sqrt(v_t) + epsilon_t),
                                        use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

  def _resource_apply_dense(self, grad, handle):
    return self._apply_dense(grad, handle)

  def _finish(self, update_ops, name_scope):
    # Update the power accumulators.
    with ops.control_dependencies(update_ops):
      beta1_power, beta2_power = self._get_beta_accumulators()
      with ops.colocate_with(beta1_power):
        update_beta1 = beta1_power.assign(beta1_power * self._beta1_t,
                                          use_locking=self._use_locking)
        update_beta2 = beta2_power.assign(beta2_power * self._beta2_t,
                                          use_locking=self._use_locking)
    return control_flow_ops.group(*update_ops + [update_beta1, update_beta2], name=name_scope)
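
As a quick sanity check, the optimizer can be dropped in wherever a tf.train optimizer is used. The snippet below is a minimal usage sketch, not part of the original gist, assuming TensorFlow 1.x graph mode and the AdamirrorOptimizer class defined above; the toy quadratic loss merely stands in for a real generator/discriminator loss.

import tensorflow as tf

# Toy problem: minimize ||w||^2 so the effect of the optimistic updates is easy to inspect.
w = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(w))
train_op = AdamirrorOptimizer(learning_rate=0.001).minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(1000):
    sess.run(train_op)
  print(sess.run(w))  # should have moved toward the minimum at [0, 0]
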
"""Tests for Adamirror."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from tensorflow.python.client import session
from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from .adamirror import AdamirrorOptimizer


def adam_update_numpy(param, g_t, t, m, v, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
  alpha_t_1 = alpha * np.sqrt(1 - beta2**(t - 1)) / (1 - beta1**(t - 1)) if t > 1 else alpha_t

  m_t = beta1 * m + (1 - beta1) * g_t
  v_t = beta2 * v + (1 - beta2) * g_t * g_t
  # Optimistic Adam reference update: subtract twice the current Adam step and
  # add back the previous one.
  param_t = param - 2 * alpha_t * m_t / (np.sqrt(v_t) + epsilon) + alpha_t_1 * m / (np.sqrt(v) + epsilon)
  return param_t, m_t, v_t


class AdamirrorOptimizerTest(test.TestCase):

  def doTestBasic(self, use_resource=False):
    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
      with self.test_session(graph=ops.Graph()):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
        epsilon = 1e-7 if dtype == dtypes.half else 1e-8

        if use_resource:
          var0 = resource_variable_ops.ResourceVariable(var0_np, name="var0_%d" % i)
          var1 = resource_variable_ops.ResourceVariable(var1_np, name="var1_%d" % i)
        else:
          var0 = variables.Variable(var0_np)
          var1 = variables.Variable(var1_np)
        grads0 = constant_op.constant(grads0_np)
        grads1 = constant_op.constant(grads1_np)

        opt = AdamirrorOptimizer(epsilon=epsilon)
        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        opt_variables = opt.variables()
        beta1_power, beta2_power = opt._get_beta_accumulators()
        self.assertTrue(beta1_power is not None)
        self.assertTrue(beta2_power is not None)
        self.assertIn(beta1_power, opt_variables)
        self.assertIn(beta2_power, opt_variables)

        with ops.Graph().as_default():
          # Shouldn't return non-slot variables from other graphs.
          self.assertEqual(0, len(opt.variables()))

        if not context.executing_eagerly():
          self.evaluate(variables.global_variables_initializer())
          # Fetch params to validate initial values
          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
          self.assertAllClose([3.0, 4.0], self.evaluate(var1))

        beta1_power, beta2_power = opt._get_beta_accumulators()

        # Run 3 steps of Adam
        for t in range(1, 4):
          if not context.executing_eagerly():
            self.evaluate(update)
          elif t > 1:
            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

          self.assertAllCloseAccordingToType(0.9**(t + 1), self.evaluate(beta1_power))
          self.assertAllCloseAccordingToType(0.999**(t + 1), self.evaluate(beta2_power))

          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0, epsilon=epsilon)
          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1, epsilon=epsilon)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
          if use_resource:
            self.assertEqual("var0_%d/Adamirror:0" % (i,),
                             opt.get_slot(var=var0, name="m").name)

  def testBasic(self):
    with self.test_session():
      self.doTestBasic(use_resource=False)

  @test_util.run_in_graph_and_eager_modes(reset_test=True)
  def testResourceBasic(self):
    self.doTestBasic(use_resource=True)

  def testTensorLearningRate(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
        epsilon = 1e-7 if dtype == dtypes.half else 1e-8

        var0 = variables.Variable(var0_np)
        var1 = variables.Variable(var1_np)
        grads0 = constant_op.constant(grads0_np)
        grads1 = constant_op.constant(grads1_np)
        opt = AdamirrorOptimizer(constant_op.constant(0.001), epsilon=epsilon)
        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        variables.global_variables_initializer().run()

        # Fetch params to validate initial values
        self.assertAllClose([1.0, 2.0], var0.eval())
        self.assertAllClose([3.0, 4.0], var1.eval())

        beta1_power, beta2_power = opt._get_beta_accumulators()

        # Run 3 steps of Adam
        for t in range(1, 4):
          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
          update.run()

          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0, epsilon=epsilon)
          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1, epsilon=epsilon)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, var0.eval())
          self.assertAllCloseAccordingToType(var1_np, var1.eval())

  def testSharing(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
        epsilon = 1e-7 if dtype == dtypes.half else 1e-8

        var0 = variables.Variable(var0_np)
        var1 = variables.Variable(var1_np)
        grads0 = constant_op.constant(grads0_np)
        grads1 = constant_op.constant(grads1_np)
        opt = AdamirrorOptimizer(epsilon=epsilon)
        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        variables.global_variables_initializer().run()

        beta1_power, beta2_power = opt._get_beta_accumulators()

        # Fetch params to validate initial values
        self.assertAllClose([1.0, 2.0], var0.eval())
        self.assertAllClose([3.0, 4.0], var1.eval())

        # Run 3 steps of intertwined Adam1 and Adam2.
        for t in range(1, 4):
          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
          if t % 2 == 0:
            update1.run()
          else:
            update2.run()

          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0, epsilon=epsilon)
          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1, epsilon=epsilon)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, var0.eval())
          self.assertAllCloseAccordingToType(var1_np, var1.eval())

  def testTwoSessions(self):
    optimizer = AdamirrorOptimizer()

    g = ops.Graph()
    with g.as_default():
      with session.Session():
        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
        grads0 = constant_op.constant(np.array([0.1, 0.1]))
        optimizer.apply_gradients([(grads0, var0)])

    gg = ops.Graph()
    with gg.as_default():
      with session.Session():
        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
        grads0 = constant_op.constant(np.array([0.1, 0.1]))

        # If the optimizer saves any state not keyed by graph the following line
        # fails.
        optimizer.apply_gradients([(grads0, var0)])

  def testSlotsUniqueEager(self):
    with context.eager_mode():
      v1 = resource_variable_ops.ResourceVariable(1.)
      v2 = resource_variable_ops.ResourceVariable(1.)
      opt = AdamirrorOptimizer(1.)
      opt.minimize(lambda: v1 + v2)
      # There should be two non-slot variables, and two unique slot variables
      # for v1 and v2 respectively.
      self.assertEqual(6, len(set(opt.variables())))


if __name__ == "__main__":
  test.main()