@lhlmgr
Created October 26, 2018 09:01
Graph-Mode vs. Eager-Mode comparison
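Both snippets below build the same 2-D normalizing flow (a chain of Affine and LeakyReLU bijectors over a standard-normal base distribution) and time ten Adam training steps on samples from the same synthetic 2-D target. The first snippet runs with eager execution enabled; the second builds the graph once and runs the train op in a tf.Session.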
#source: https://github.com/ericjang/normalizing-flows-tutorial/blob/master/nf_part1_intro.ipynb
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.contrib.eager as tfe
import time
tf.enable_eager_execution()
tfd = tfp.distributions
tfb = tfp.bijectors
# quite easy to interpret - multiplying by alpha causes a contraction in volume.
class LeakyReLU(tfb.Bijector):
    def __init__(self, alpha=0.5, validate_args=False, name="leaky_relu"):
        super(LeakyReLU, self).__init__(forward_min_event_ndims=1, inverse_min_event_ndims=1,
                                        validate_args=validate_args, name=name)
        self.alpha = alpha

    def _forward(self, x):
        return tf.where(tf.greater_equal(x, 0), x, self.alpha * x)

    def _inverse(self, y):
        return tf.where(tf.greater_equal(y, 0), y, 1. / self.alpha * y)

    def _inverse_log_det_jacobian(self, y):
        event_dims = self.inverse_min_event_ndims
        I = tf.ones_like(y)
        J_inv = tf.where(tf.greater_equal(y, 0), I, 1.0 / self.alpha * I)
        # abs is actually redundant here, since this det Jacobian is > 0
        log_abs_det_J_inv = tf.log(tf.abs(J_inv))
        return tf.reduce_sum(log_abs_det_J_inv, axis=event_dims)
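# Quick sanity check of the bijector above (not part of the original benchmark;
# the _lrelu-prefixed names are illustrative only). Each negative coordinate of
# y contributes log(1/alpha) to the inverse log-det-Jacobian, so with
# alpha = 0.5 a single negative entry gives log(2) ~= 0.6931.
_lrelu_check = LeakyReLU(alpha=0.5)
_y = _lrelu_check.forward([[-2.0, 3.0]])                          # [[-1.0, 3.0]]
_x = _lrelu_check.inverse(_y)                                     # [[-2.0, 3.0]]
_ildj = _lrelu_check.inverse_log_det_jacobian(_y, event_ndims=1)  # [~0.6931]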
batch_size=512
DTYPE = tf.float32
class NFModel(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def call(self, *args, **kwargs):
        num_layers = 6
        d, r = 2, 2
        bijectors = []

        for i in range(num_layers):
            with tf.variable_scope('bijector_%d' % i):
                V = tf.get_variable('V', [d, r], dtype=DTYPE)  # factor loading
                shift = tf.get_variable('shift', [d], dtype=DTYPE)  # affine shift
                L = tf.get_variable('L', [d * (d + 1) // 2], dtype=DTYPE)  # lower triangular
                bijectors.append(tfb.Affine(
                    scale_tril=tfd.fill_triangular(L),
                    scale_perturb_factor=V,
                    shift=shift,
                ))
                alpha = tf.get_variable('alpha', [], dtype=DTYPE)
                abs_alpha = tf.abs(alpha) + .01
                bijectors.append(LeakyReLU(alpha=abs_alpha))

        base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
        mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
        dist = tfd.TransformedDistribution(distribution=base_dist, bijector=mlp_bijector)
        return {"dist": dist}
def loss_fn(model, inputs, targets):
    result_dict = model(inputs)
    loss = -tf.reduce_mean(result_dict["dist"].log_prob(inputs))
    return loss
def _grad_fn(model, inputs, targets):
    # tfe.implicit_value_and_gradients records its own tape, so no explicit
    # tf.GradientTape context is needed here.
    value_and_grad_fn = tfe.implicit_value_and_gradients(loss_fn)
    return value_and_grad_fn(model, inputs, targets)
NUM_STEPS = int(1e5)
global_step = []
np_losses = []
with tf.device("/gpu:0"):
    x2_dist = tfd.Normal(loc=0., scale=4.)
    x2_samples = x2_dist.sample(batch_size)
    x1 = tfd.Normal(loc=.25 * tf.square(x2_samples),
                    scale=tf.ones(batch_size, dtype=tf.float32))
    x1_samples = x1.sample()
    x_samples = tf.stack([x1_samples, x2_samples], axis=1)

    model = NFModel()
    optimizer = tf.train.AdamOptimizer(1e-3)

    for i in range(10):
        start = time.time()
        loss, grads = _grad_fn(model, x_samples, None)
        optimizer.apply_gradients(grads, global_step=tf.train.get_or_create_global_step())
        print('Runtime is %2.5f' % (time.time() - start))
# With
# with tf.device("/gpu:0"):
# ...
#
# Runtime is 0.35395
# Runtime is 0.12711
# Runtime is 0.12438
# Runtime is 0.12428
# Runtime is 0.12572
# Runtime is 0.12593
# Runtime is 0.12505
# Runtime is 0.12527
# Runtime is 0.12418
# Runtime is 0.12340
# With
# with tf.device("/cpu:0"):
# ...
#
# Runtime is 0.10901
# Runtime is 0.10122
# Runtime is 0.10144
# Runtime is 0.10114
# Runtime is 0.10008
# Runtime is 0.10219
# Runtime is 0.10788
# Runtime is 0.10865
# Runtime is 0.10675
# Runtime is 0.10814
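The graph-mode counterpart below builds the same flow and its training op once at graph-construction time, then executes them repeatedly in a tf.Session.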
#source: https://github.com/ericjang/normalizing-flows-tutorial/blob/master/nf_part1_intro.ipynb
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp
import time
tfd = tfp.distributions
tfb = tfp.bijectors
batch_size=512
DTYPE=tf.float32
NP_DTYPE=np.float32
# quite easy to interpret - multiplying by alpha causes a contraction in volume.
class LeakyReLU(tfb.Bijector):
    def __init__(self, alpha=0.5, validate_args=False, name="leaky_relu"):
        super(LeakyReLU, self).__init__(
            forward_min_event_ndims=1, validate_args=validate_args, name=name)
        self.alpha = alpha

    def _forward(self, x):
        return tf.where(tf.greater_equal(x, 0), x, self.alpha * x)

    def _inverse(self, y):
        return tf.where(tf.greater_equal(y, 0), y, 1. / self.alpha * y)

    def _inverse_log_det_jacobian(self, y):
        event_dims = self.forward_min_event_ndims
        I = tf.ones_like(y)
        J_inv = tf.where(tf.greater_equal(y, 0), I, 1.0 / self.alpha * I)
        # abs is actually redundant here, since this det Jacobian is > 0
        log_abs_det_J_inv = tf.log(tf.abs(J_inv))
        return tf.reduce_sum(log_abs_det_J_inv, axis=event_dims)
d, r = 2, 2
bijectors = []
num_layers = 6
x2_dist = tfd.Normal(loc=0., scale=4.)
x2_samples = x2_dist.sample(batch_size)
x1 = tfd.Normal(loc=.25 * tf.square(x2_samples),
                scale=tf.ones(batch_size, dtype=DTYPE))
x1_samples = x1.sample()
x_samples = tf.stack([x1_samples, x2_samples], axis=1)
for i in range(num_layers):
    with tf.variable_scope('bijector_%d' % i):
        V = tf.get_variable('V', [d, r], dtype=DTYPE)  # factor loading
        shift = tf.get_variable('shift', [d], dtype=DTYPE)  # affine shift
        L = tf.get_variable('L', [d * (d + 1) // 2],
                            dtype=DTYPE)  # lower triangular
        bijectors.append(tfb.Affine(
            scale_tril=tfd.fill_triangular(L),
            scale_perturb_factor=V,
            shift=shift,
        ))
        alpha = tf.abs(tf.get_variable('alpha', [], dtype=DTYPE)) + .01
        bijectors.append(LeakyReLU(alpha=alpha))
base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
dist = tfd.TransformedDistribution(
    distribution=base_dist,
    bijector=mlp_bijector
)
loss = -tf.reduce_mean(dist.log_prob(x_samples))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(10):
        start = time.time()
        _, np_loss = sess.run([train_op, loss])
        print('Runtime is %2.5f' % (time.time() - start))
# Runtime is 0.81241
# Runtime is 0.00573
# Runtime is 0.00573
# Runtime is 0.00570
# Runtime is 0.00555
# Runtime is 0.00564
# Runtime is 0.00545
# Runtime is 0.00540
# Runtime is 0.00591
# Runtime is 0.00574
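After the first (warm-up) step, graph mode completes a training step in roughly 0.006 s, versus roughly 0.12 s on GPU or 0.10 s on CPU for the eager-mode loop above, so in these runs the per-step cost drops by more than an order of magnitude.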