Julia VAE
#=
Recreates the variational autoencoder visualization experiment from the paper
"Auto-Encoding Variational Bayes" by Kingma and Welling. The paper is available
at https://arxiv.org/abs/1312.6114.
=#
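# The training objective is the per-datapoint evidence lower bound (ELBO)
# from the paper,
#     L(theta, phi; x) = -KL(q_phi(z|x) || p(z)) + E_{q_phi(z|x)}[log p_theta(x|z)],
# estimated below with a single reparameterized sample z = mu + sigma .* eps,
# where eps ~ N(0, I).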
import TensorFlow
const tf = TensorFlow
include("mnist_loader.jl")
# Configuration for experiment.
MNIST_SIZE = 784 # Number of pixels in MNIST images.
BATCH_SIZE = 100 # Number of images in a minibatch.
HIDDEN_SIZE = 100 # Hidden layer size.
LATENT_SIZE = 2 # Size (dimension) of latent representation.
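# Note: a 2-D latent space is what lets the paper's visualization experiment
# plot the learned manifold directly.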
"""
Add a fully-connected linear layer to the graph.
Given an input tensor X of shape [BATCH, INPUT_SIZE], create a linear layer
with weights (shape [INPUT_SIZE, OUTPUT_SIZE]) and biases (shape [OUTPUT_SIZE]),
and return a new tensor Y = X * W + b (shape [BATCH, OUTPUT_SIZE]). Weights are
initialized with standard normals, and biases are initialized to zero.
Args:
input_tensor: 2-D tensor with shape [BATCH, input_size].
input_size: Length of second dimension of input_tensor.
output_size: Length of second dimension of output_tensor.
Returns:
output_tensor: 2-D tensor with shape [BATCH, output_size], the output of
the linear layer.
"""
function add_linear_layer(input_tensor, input_size, output_size)
    # TODO: Use shape inference to remove need for input_size.
    weights = tf.Variable(randn(Float32, input_size, output_size))
    biases = tf.Variable(zeros(Float32, output_size))
    return input_tensor * weights + biases
end
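# Example (shapes only; `x` here is a hypothetical [BATCH_SIZE, MNIST_SIZE]
# tensor, not a name used elsewhere in this script):
#   h = add_linear_layer(x, MNIST_SIZE, HIDDEN_SIZE)  # [BATCH_SIZE, HIDDEN_SIZE]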
"""
Add a Gaussian distribution parameter encoder to the graph.
Args:
input_tensor: 2-D tensor with shape [BATCH, INPUT_SIZE].
input_size: Size of second dimension of input_tensor.
hidden_size: Size of hidden layer.
latent_size: Size of latent space.
Retuns:
mu: Means for latent space Gaussian.
logsigma2: Log variance for latent space Gaussian.
"""
function create_encoder(input_tensor, input_size, hidden_size, latent_size)
    # Create hidden layer (linear with tanh activation).
    hidden = tf.nn.tanh(add_linear_layer(input_tensor, input_size, hidden_size))
    # Gaussian parameters are linear transformations of hidden layer.
    mu = add_linear_layer(hidden, hidden_size, latent_size)
    logsigma2 = add_linear_layer(hidden, hidden_size, latent_size)
    return mu, logsigma2
end
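# Note: the encoder predicts log sigma^2 rather than sigma directly, so the
# linear output is unconstrained; exp() later recovers a positive variance.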
"""
Add a Bernoulli distribution parameter decoder to the graph.
Args:
latent_tensor: 2-D tensor with shape [BATCH, latent_size].
latent_size: Size of second dimension of latent_tensor.
hidden_size: Size of hidden layer.
output_size: Size of second dimension of output_probs.
Returns:
output_probs: 2-D of tensor of probabilities, shape [BATCH, output_size].
"""
function create_decoder(latent_tensor, latent_size, hidden_size, output_size)
    # Create hidden layer (linear with tanh activation).
    hidden = tf.nn.tanh(add_linear_layer(latent_tensor, latent_size,
                                         hidden_size))
    # Probabilities are produced from a sigmoid activation to ensure [0,1].
    output_probs = tf.nn.sigmoid(add_linear_layer(hidden, hidden_size,
                                                  output_size))
    return output_probs
end
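# Note: this parameterizes p(x|z) as independent Bernoullis, one per pixel,
# so output_probs holds the per-pixel means (the paper's "Bernoulli MLP").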
function vae_experiment()
    # Prepare MNIST images.
    loader = DataLoader()
    # Create an input placeholder.
    input_ph = tf.placeholder(Float32, shape=(BATCH_SIZE, MNIST_SIZE))
    # Construct an encoder that produces Gaussian distribution parameters from
    # the MNIST images.
    mu, logsigma2 = create_encoder(input_ph, MNIST_SIZE, HIDDEN_SIZE,
                                   LATENT_SIZE)
    # TODO: No random distribution ops currently, so we need to manually feed
    # draws from a multivariate normal through a placeholder.
    normal_ph = tf.placeholder(Float32, shape=(BATCH_SIZE, LATENT_SIZE))
    # Sample from latent distribution.
    sigma = exp(0.5 * logsigma2)
    sample = mu + sigma .* normal_ph
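    # This is the reparameterization trick: z = mu + sigma .* eps with
    # eps ~ N(0, I) gives a differentiable path from the objective back to
    # mu and logsigma2, so gradients can flow through the sampling step.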
    # Decode samples to Bernoulli distribution parameters (probabilities).
    output_probs = create_decoder(sample, LATENT_SIZE, HIDDEN_SIZE, MNIST_SIZE)
    # Calculate the objective for the VAE. It has two parts, and unlike, say,
    # the L2 reconstruction loss of an ordinary autoencoder, we will be trying
    # to maximize it: it is the "evidence lower bound" (ELBO).
    # First part: the negative KL divergence from the prior, which acts as a
    # regularizer. It has a closed form for a diagonal Gaussian posterior and
    # standard normal prior (Appendix B of the paper).
    neg_kl_divergence = 0.5 * tf.reduce_sum(1 + logsigma2 - mu.^2 - exp(logsigma2),
                                            reduction_indices=[2])
    # Second part: the expected reconstruction log-likelihood, i.e. the
    # Bernoulli log-probability of the observed pixels.
    recon_loglik = tf.reduce_sum(input_ph .* log(output_probs) +
                                 (1 - input_ph) .* log(1 - output_probs),
                                 reduction_indices=[2])
    objective = tf.reduce_mean(neg_kl_divergence + recon_loglik)
    # Create optimizer. Gradient descent on -objective maximizes the ELBO.
    train_step = tf.train.minimize(tf.train.GradientDescentOptimizer(0.01),
                                   -objective)
    # Initialize the weights and biases. The session uses the default graph,
    # which is where all the ops above were created.
    sess = tf.Session()
    println("Initializing variables...")
    tf.run(sess, tf.initialize_all_variables())
    # Optimize the parameters of the VAE.
    for iteration in 1:10
        # Sample next batch of MNIST images.
        batch = next_batch(loader, BATCH_SIZE)
        # Sample from a multivariate normal distribution.
        normal_sample = randn(BATCH_SIZE, LATENT_SIZE)
        # TODO: Fetch the objective and run the training step in a single
        # tf.run call once fetching an op alongside a tensor works.
        cur_objective = tf.run(sess, objective,
                               Dict(input_ph=>batch[1],
                                    normal_ph=>normal_sample))
        tf.run(sess, train_step,
               Dict(input_ph=>batch[1],
                    normal_ph=>normal_sample))
        @printf("Iter. %4d, objective: %7.2f\n", iteration, cur_objective)
    end
end
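# Quick numeric sanity check (plain Julia, no TensorFlow; not part of the
# original experiment) that the closed-form term above really is the negative
# of KL(N(mu, sigma^2) || N(0, 1)) = 0.5 * (mu^2 + sigma^2 - log(sigma^2) - 1),
# checked at one arbitrary point:
let mu = 0.3, logsigma2 = -0.2
    neg_kl = 0.5 * (1 + logsigma2 - mu^2 - exp(logsigma2))
    kl = 0.5 * (mu^2 + exp(logsigma2) - logsigma2 - 1)
    @assert isapprox(neg_kl, -kl)
end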
vae_experiment()