Graph-Mode vs. Eager-Mode comparison

The two files below implement the same normalizing-flow training step, adapted from Eric Jang's tutorial (linked in the source comments). The first runs under TensorFlow eager execution; the second builds a static graph and executes it in a Session. Each prints the wall-clock time per optimization step.
# --- Eager-mode version ---
# source: https://github.com/ericjang/normalizing-flows-tutorial/blob/master/nf_part1_intro.ipynb
import time

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

tfd = tfp.distributions
tfb = tfp.bijectors
# The leaky ReLU is easy to interpret as a bijector: multiplying negative
# inputs by alpha contracts volume, so the Jacobian is diagonal and its
# log-determinant is simple to compute.
class LeakyReLU(tfb.Bijector):
    def __init__(self, alpha=0.5, validate_args=False, name="leaky_relu"):
        super(LeakyReLU, self).__init__(forward_min_event_ndims=1,
                                        inverse_min_event_ndims=1,
                                        validate_args=validate_args, name=name)
        self.alpha = alpha

    def _forward(self, x):
        return tf.where(tf.greater_equal(x, 0), x, self.alpha * x)

    def _inverse(self, y):
        return tf.where(tf.greater_equal(y, 0), y, 1. / self.alpha * y)

    def _inverse_log_det_jacobian(self, y):
        event_dims = self.inverse_min_event_ndims
        I = tf.ones_like(y)
        J_inv = tf.where(tf.greater_equal(y, 0), I, 1.0 / self.alpha * I)
        # abs is actually redundant here, since this det Jacobian is > 0
        log_abs_det_J_inv = tf.log(tf.abs(J_inv))
        return tf.reduce_sum(log_abs_det_J_inv, axis=event_dims)
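# Optional sanity check (not in the original gist): with eager execution
# enabled above, the bijector can be exercised directly. Forward followed by
# inverse should round-trip, and the inverse log-det-Jacobian of a point with
# one negative coordinate should be log(1/alpha). Names with a leading
# underscore are demo-only.
_lrelu = LeakyReLU(alpha=0.5)
_x = tf.constant([[-2.0, 3.0]])
_y = _lrelu.forward(_x)  # [[-1.0, 3.0]]
assert bool(tf.reduce_all(tf.equal(_lrelu.inverse(_y), _x)))
_ildj = _lrelu.inverse_log_det_jacobian(_y, event_ndims=1)  # log(2.0), from the negative coordinate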
batch_size = 512
DTYPE = tf.float32
class NFModel(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def call(self, *args, **kwargs):
        # NOTE: the whole flow (variables included) is built inside call(), so
        # under eager execution every invocation reconstructs it; the timed
        # steps below include that overhead.
        num_layers = 6
        d, r = 2, 2
        bijectors = []
        for i in range(num_layers):
            with tf.variable_scope('bijector_%d' % i):
                V = tf.get_variable('V', [d, r], dtype=DTYPE)  # factor loading
                shift = tf.get_variable('shift', [d], dtype=DTYPE)  # affine shift
                L = tf.get_variable('L', [d * (d + 1) // 2], dtype=DTYPE)  # lower triangular
                bijectors.append(tfb.Affine(
                    scale_tril=tfd.fill_triangular(L),
                    scale_perturb_factor=V,
                    shift=shift,
                ))
                alpha = tf.get_variable('alpha', [], dtype=DTYPE)
                abs_alpha = tf.abs(alpha) + .01
                bijectors.append(LeakyReLU(alpha=abs_alpha))
        base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
        # Drop the trailing LeakyReLU and reverse, since Chain applies
        # bijectors in reverse order.
        mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
        dist = tfd.TransformedDistribution(distribution=base_dist, bijector=mlp_bijector)
        return {"dist": dist}
def loss_fn(model, inputs, targets):
    result_dict = model(inputs)
    # Negative log-likelihood of the observed samples under the flow.
    loss = -tf.reduce_mean(result_dict["dist"].log_prob(inputs))
    return loss
def _grad_fn(model, inputs, targets):
    # implicit_value_and_gradients differentiates loss_fn with respect to all
    # trainable variables it uses, so no explicit GradientTape is needed here.
    grad_fn = tfe.implicit_value_and_gradients(loss_fn)
    return grad_fn(model, inputs, targets)
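# An equivalent formulation (a sketch, not in the original gist) using the
# standard eager GradientTape API instead of tf.contrib's implicit gradients:
# the tape records the forward pass and is then differentiated against the
# trainable variables it watched.
def _tape_grad_fn(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss = loss_fn(model, inputs, targets)
    variables = tape.watched_variables()
    return loss, list(zip(tape.gradient(loss, variables), variables))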
with tf.device("/gpu:0"):
    # Target density: x2 ~ N(0, 4), x1 | x2 ~ N(x2^2 / 4, 1) -- the
    # banana-shaped distribution from the source notebook.
    x2_dist = tfd.Normal(loc=0., scale=4.)
    x2_samples = x2_dist.sample(batch_size)
    x1 = tfd.Normal(loc=.25 * tf.square(x2_samples),
                    scale=tf.ones(batch_size, dtype=tf.float32))
    x1_samples = x1.sample()
    x_samples = tf.stack([x1_samples, x2_samples], axis=1)

    model = NFModel()
    optimizer = tf.train.AdamOptimizer(1e-3)

    for i in range(10):
        start = time.time()
        loss, grads = _grad_fn(model, x_samples, None)
        optimizer.apply_gradients(grads, global_step=tf.train.get_or_create_global_step())
        print('Runtime is %2.5f' % (time.time() - start))
# With
# with tf.device("/gpu:0"):
#   ...
#
# Runtime is 0.35395
# Runtime is 0.12711
# Runtime is 0.12438
# Runtime is 0.12428
# Runtime is 0.12572
# Runtime is 0.12593
# Runtime is 0.12505
# Runtime is 0.12527
# Runtime is 0.12418
# Runtime is 0.12340
#
# With
# with tf.device("/cpu:0"):
#   ...
#
# Runtime is 0.10901
# Runtime is 0.10122
# Runtime is 0.10144
# Runtime is 0.10114
# Runtime is 0.10008
# Runtime is 0.10219
# Runtime is 0.10788
# Runtime is 0.10865
# Runtime is 0.10675
# Runtime is 0.10814
# --- Graph-mode version ---
# source: https://github.com/ericjang/normalizing-flows-tutorial/blob/master/nf_part1_intro.ipynb
import time

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
tfb = tfp.bijectors

batch_size = 512
DTYPE = tf.float32
# Same LeakyReLU bijector as in the eager-mode file above.
class LeakyReLU(tfb.Bijector):
    def __init__(self, alpha=0.5, validate_args=False, name="leaky_relu"):
        super(LeakyReLU, self).__init__(
            forward_min_event_ndims=1, validate_args=validate_args, name=name)
        self.alpha = alpha

    def _forward(self, x):
        return tf.where(tf.greater_equal(x, 0), x, self.alpha * x)

    def _inverse(self, y):
        return tf.where(tf.greater_equal(y, 0), y, 1. / self.alpha * y)

    def _inverse_log_det_jacobian(self, y):
        event_dims = self.forward_min_event_ndims
        I = tf.ones_like(y)
        J_inv = tf.where(tf.greater_equal(y, 0), I, 1.0 / self.alpha * I)
        # abs is actually redundant here, since this det Jacobian is > 0
        log_abs_det_J_inv = tf.log(tf.abs(J_inv))
        return tf.reduce_sum(log_abs_det_J_inv, axis=event_dims)
d, r = 2, 2
num_layers = 6
bijectors = []

# Target density: x2 ~ N(0, 4), x1 | x2 ~ N(x2^2 / 4, 1) -- the same
# banana-shaped distribution as in the eager-mode file.
x2_dist = tfd.Normal(loc=0., scale=4.)
x2_samples = x2_dist.sample(batch_size)
x1 = tfd.Normal(loc=.25 * tf.square(x2_samples),
                scale=tf.ones(batch_size, dtype=DTYPE))
x1_samples = x1.sample()
x_samples = tf.stack([x1_samples, x2_samples], axis=1)

for i in range(num_layers):
    with tf.variable_scope('bijector_%d' % i):
        V = tf.get_variable('V', [d, r], dtype=DTYPE)  # factor loading
        shift = tf.get_variable('shift', [d], dtype=DTYPE)  # affine shift
        L = tf.get_variable('L', [d * (d + 1) // 2],
                            dtype=DTYPE)  # lower triangular
        bijectors.append(tfb.Affine(
            scale_tril=tfd.fill_triangular(L),
            scale_perturb_factor=V,
            shift=shift,
        ))
        alpha = tf.abs(tf.get_variable('alpha', [], dtype=DTYPE)) + .01
        bijectors.append(LeakyReLU(alpha=alpha))

base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
# Drop the trailing LeakyReLU and reverse, since Chain applies bijectors in
# reverse order.
mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
dist = tfd.TransformedDistribution(
    distribution=base_dist,
    bijector=mlp_bijector)
loss = -tf.reduce_mean(dist.log_prob(x_samples))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(10):
        start = time.time()
        # One optimization step; fetching loss forces its evaluation too.
        _, np_loss = sess.run([train_op, loss])
        print('Runtime is %2.5f' % (time.time() - start))
# Runtime is 0.81241
# Runtime is 0.00573
# Runtime is 0.00573
# Runtime is 0.00570
# Runtime is 0.00555
# Runtime is 0.00564
# Runtime is 0.00545
# Runtime is 0.00540
# Runtime is 0.00591
# Runtime is 0.00574
#
# The first step includes one-time graph construction and session warm-up.
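For a rough summary of the timings above, here is a small standalone sketch; the lists are copied from the comments in both files, dropping each run's first warm-up step:

eager_gpu = [0.12711, 0.12438, 0.12428, 0.12572, 0.12593, 0.12505, 0.12527, 0.12418, 0.12340]
eager_cpu = [0.10122, 0.10144, 0.10114, 0.10008, 0.10219, 0.10788, 0.10865, 0.10675, 0.10814]
graph = [0.00573, 0.00573, 0.00570, 0.00555, 0.00564, 0.00545, 0.00540, 0.00591, 0.00574]

mean = lambda xs: sum(xs) / len(xs)
print('eager/gpu: %.4f s/step' % mean(eager_gpu))  # ~0.1250
print('eager/cpu: %.4f s/step' % mean(eager_cpu))  # ~0.1042
print('graph:     %.4f s/step' % mean(graph))      # ~0.0057
print('graph speedup over eager/cpu: ~%.0fx' % (mean(eager_cpu) / mean(graph)))  # ~18x

So for this small model, graph mode is roughly an order of magnitude faster per step, and the eager version gains nothing from the GPU at this scale.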