Graph-Mode vs. Eager-Mode comparison

The two files below implement the same normalizing-flow training step, adapted from Eric Jang's tutorial (linked in the source comments). The first runs under TensorFlow eager execution; the second builds a static graph and executes it in a Session. Each prints the wall-clock time per optimization step.
# --- Eager-mode version ---
# source: https://github.com/ericjang/normalizing-flows-tutorial/blob/master/nf_part1_intro.ipynb
import time

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

tfd = tfp.distributions
tfb = tfp.bijectors
# The leaky ReLU is easy to interpret as a bijector: multiplying negative
# inputs by alpha contracts volume, so the Jacobian is diagonal and its
# log-determinant is simple to compute.
class LeakyReLU(tfb.Bijector):
    def __init__(self, alpha=0.5, validate_args=False, name="leaky_relu"):
        super(LeakyReLU, self).__init__(forward_min_event_ndims=1,
                                        inverse_min_event_ndims=1,
                                        validate_args=validate_args, name=name)
        self.alpha = alpha

    def _forward(self, x):
        return tf.where(tf.greater_equal(x, 0), x, self.alpha * x)

    def _inverse(self, y):
        return tf.where(tf.greater_equal(y, 0), y, 1. / self.alpha * y)

    def _inverse_log_det_jacobian(self, y):
        event_dims = self.inverse_min_event_ndims
        I = tf.ones_like(y)
        J_inv = tf.where(tf.greater_equal(y, 0), I, 1.0 / self.alpha * I)
        # abs is actually redundant here, since this det Jacobian is > 0
        log_abs_det_J_inv = tf.log(tf.abs(J_inv))
        return tf.reduce_sum(log_abs_det_J_inv, axis=event_dims)
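# Optional sanity check (not in the original gist): with eager execution
# enabled above, the bijector can be exercised directly. Forward followed by
# inverse should round-trip, and the inverse log-det-Jacobian of a point with
# one negative coordinate should be log(1/alpha). Names with a leading
# underscore are demo-only.
_lrelu = LeakyReLU(alpha=0.5)
_x = tf.constant([[-2.0, 3.0]])
_y = _lrelu.forward(_x)  # [[-1.0, 3.0]]
assert bool(tf.reduce_all(tf.equal(_lrelu.inverse(_y), _x)))
_ildj = _lrelu.inverse_log_det_jacobian(_y, event_ndims=1)  # log(2.0), from the negative coordinate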
batch_size = 512
DTYPE = tf.float32
class NFModel(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def call(self, *args, **kwargs):
        # NOTE: the whole flow (variables included) is built inside call(), so
        # under eager execution every invocation reconstructs it; the timed
        # steps below include that overhead.
        num_layers = 6
        d, r = 2, 2
        bijectors = []
        for i in range(num_layers):
            with tf.variable_scope('bijector_%d' % i):
                V = tf.get_variable('V', [d, r], dtype=DTYPE)  # factor loading
                shift = tf.get_variable('shift', [d], dtype=DTYPE)  # affine shift
                L = tf.get_variable('L', [d * (d + 1) // 2], dtype=DTYPE)  # lower triangular
                bijectors.append(tfb.Affine(
                    scale_tril=tfd.fill_triangular(L),
                    scale_perturb_factor=V,
                    shift=shift,
                ))
                alpha = tf.get_variable('alpha', [], dtype=DTYPE)
                abs_alpha = tf.abs(alpha) + .01
                bijectors.append(LeakyReLU(alpha=abs_alpha))
        base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
        # Drop the trailing LeakyReLU and reverse, since Chain applies
        # bijectors in reverse order.
        mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
        dist = tfd.TransformedDistribution(distribution=base_dist, bijector=mlp_bijector)
        return {"dist": dist}
def loss_fn(model, inputs, targets):
    result_dict = model(inputs)
    # Negative log-likelihood of the observed samples under the flow.
    loss = -tf.reduce_mean(result_dict["dist"].log_prob(inputs))
    return loss
def _grad_fn(model, inputs, targets):
    # implicit_value_and_gradients differentiates loss_fn with respect to all
    # trainable variables it uses, so no explicit GradientTape is needed here.
    grad_fn = tfe.implicit_value_and_gradients(loss_fn)
    return grad_fn(model, inputs, targets)
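# An equivalent formulation (a sketch, not in the original gist) using the
# standard eager GradientTape API instead of tf.contrib's implicit gradients:
# the tape records the forward pass and is then differentiated against the
# trainable variables it watched.
def _tape_grad_fn(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss = loss_fn(model, inputs, targets)
    variables = tape.watched_variables()
    return loss, list(zip(tape.gradient(loss, variables), variables))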
with tf.device("/gpu:0"):
    # Target density: x2 ~ N(0, 4), x1 | x2 ~ N(x2^2 / 4, 1) -- the
    # banana-shaped distribution from the source notebook.
    x2_dist = tfd.Normal(loc=0., scale=4.)
    x2_samples = x2_dist.sample(batch_size)
    x1 = tfd.Normal(loc=.25 * tf.square(x2_samples),
                    scale=tf.ones(batch_size, dtype=tf.float32))
    x1_samples = x1.sample()
    x_samples = tf.stack([x1_samples, x2_samples], axis=1)

    model = NFModel()
    optimizer = tf.train.AdamOptimizer(1e-3)

    for i in range(10):
        start = time.time()
        loss, grads = _grad_fn(model, x_samples, None)
        optimizer.apply_gradients(grads, global_step=tf.train.get_or_create_global_step())
        print('Runtime is %2.5f' % (time.time() - start))
# With
# with tf.device("/gpu:0"):
#   ...
#
# Runtime is 0.35395
# Runtime is 0.12711
# Runtime is 0.12438
# Runtime is 0.12428
# Runtime is 0.12572
# Runtime is 0.12593
# Runtime is 0.12505
# Runtime is 0.12527
# Runtime is 0.12418
# Runtime is 0.12340
#
# With
# with tf.device("/cpu:0"):
#   ...
#
# Runtime is 0.10901
# Runtime is 0.10122
# Runtime is 0.10144
# Runtime is 0.10114
# Runtime is 0.10008
# Runtime is 0.10219
# Runtime is 0.10788
# Runtime is 0.10865
# Runtime is 0.10675
# Runtime is 0.10814
# --- Graph-mode version ---
# source: https://github.com/ericjang/normalizing-flows-tutorial/blob/master/nf_part1_intro.ipynb
import time

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
tfb = tfp.bijectors

batch_size = 512
DTYPE = tf.float32
# Same LeakyReLU bijector as in the eager-mode file above.
class LeakyReLU(tfb.Bijector):
    def __init__(self, alpha=0.5, validate_args=False, name="leaky_relu"):
        super(LeakyReLU, self).__init__(
            forward_min_event_ndims=1, validate_args=validate_args, name=name)
        self.alpha = alpha

    def _forward(self, x):
        return tf.where(tf.greater_equal(x, 0), x, self.alpha * x)

    def _inverse(self, y):
        return tf.where(tf.greater_equal(y, 0), y, 1. / self.alpha * y)

    def _inverse_log_det_jacobian(self, y):
        event_dims = self.forward_min_event_ndims
        I = tf.ones_like(y)
        J_inv = tf.where(tf.greater_equal(y, 0), I, 1.0 / self.alpha * I)
        # abs is actually redundant here, since this det Jacobian is > 0
        log_abs_det_J_inv = tf.log(tf.abs(J_inv))
        return tf.reduce_sum(log_abs_det_J_inv, axis=event_dims)
d, r = 2, 2
num_layers = 6
bijectors = []

# Target density: x2 ~ N(0, 4), x1 | x2 ~ N(x2^2 / 4, 1) -- the same
# banana-shaped distribution as in the eager-mode file.
x2_dist = tfd.Normal(loc=0., scale=4.)
x2_samples = x2_dist.sample(batch_size)
x1 = tfd.Normal(loc=.25 * tf.square(x2_samples),
                scale=tf.ones(batch_size, dtype=DTYPE))
x1_samples = x1.sample()
x_samples = tf.stack([x1_samples, x2_samples], axis=1)

for i in range(num_layers):
    with tf.variable_scope('bijector_%d' % i):
        V = tf.get_variable('V', [d, r], dtype=DTYPE)  # factor loading
        shift = tf.get_variable('shift', [d], dtype=DTYPE)  # affine shift
        L = tf.get_variable('L', [d * (d + 1) // 2],
                            dtype=DTYPE)  # lower triangular
        bijectors.append(tfb.Affine(
            scale_tril=tfd.fill_triangular(L),
            scale_perturb_factor=V,
            shift=shift,
        ))
        alpha = tf.abs(tf.get_variable('alpha', [], dtype=DTYPE)) + .01
        bijectors.append(LeakyReLU(alpha=alpha))

base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
# Drop the trailing LeakyReLU and reverse, since Chain applies bijectors in
# reverse order.
mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
dist = tfd.TransformedDistribution(
    distribution=base_dist,
    bijector=mlp_bijector)
loss = -tf.reduce_mean(dist.log_prob(x_samples))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(10):
        start = time.time()
        # One optimization step; fetching loss forces its evaluation too.
        _, np_loss = sess.run([train_op, loss])
        print('Runtime is %2.5f' % (time.time() - start))
# Runtime is 0.81241
# Runtime is 0.00573
# Runtime is 0.00573
# Runtime is 0.00570
# Runtime is 0.00555
# Runtime is 0.00564
# Runtime is 0.00545
# Runtime is 0.00540
# Runtime is 0.00591
# Runtime is 0.00574
#
# The first step includes one-time graph construction and session warm-up.
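For a rough summary of the timings above, here is a small standalone sketch; the lists are copied from the comments in both files, dropping each run's first warm-up step:

eager_gpu = [0.12711, 0.12438, 0.12428, 0.12572, 0.12593, 0.12505, 0.12527, 0.12418, 0.12340]
eager_cpu = [0.10122, 0.10144, 0.10114, 0.10008, 0.10219, 0.10788, 0.10865, 0.10675, 0.10814]
graph = [0.00573, 0.00573, 0.00570, 0.00555, 0.00564, 0.00545, 0.00540, 0.00591, 0.00574]

mean = lambda xs: sum(xs) / len(xs)
print('eager/gpu: %.4f s/step' % mean(eager_gpu))  # ~0.1250
print('eager/cpu: %.4f s/step' % mean(eager_cpu))  # ~0.1042
print('graph:     %.4f s/step' % mean(graph))      # ~0.0057
print('graph speedup over eager/cpu: ~%.0fx' % (mean(eager_cpu) / mean(graph)))  # ~18x

So for this small model, graph mode is roughly an order of magnitude faster per step, and the eager version gains nothing from the GPU at this scale.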