@botev
Last active July 13, 2017 13:42
Theano Yellow Fin
import os
import gzip
import pickle
import argparse
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.gradient import zero_grad
import lasagne.layers as L
import lasagne.nonlinearities as nl
from lasagne.updates import adam
from lasagne.random import get_rng
# Note: yellow_fin and its helpers are defined in the accompanying file of this gist
def run(optimizer, epochs=100, batch_size=250, seed=None):
    if seed is not None:
        np.random.seed(seed)
    dataset = Mnist()
    dataset.load()
    # Build the training function:
    f = build_train_func(optimizer, [784, 512, 512, 64])
    num_batches = int(np.ceil(50000.0 / float(batch_size)))
    results = np.zeros((num_batches * epochs, 1))
    i = 0
    for e in range(epochs):
        for x, _ in dataset.iter("train", batch_size):
            # Sample binarised version
            x = (np.random.rand(*x.shape) < x).astype(theano.config.floatX)
            results[i, 0] = f(x)
            print("[{}] Loss:".format(i), results[i, 0])
            i += 1
    np.savetxt("results.csv", results, delimiter=",")
    print("Done")
def build_train_func(optimizer, architecture):
    # Input
    l_in = L.InputLayer((None, architecture[0]), name="x")
    # x
    layer = l_in
    # Encoder x -> z
    for i, a in enumerate(architecture[1:-1]):
        name = "encode_{}".format(i + 1)
        layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name)
    name = "encode_{}".format(len(architecture) - 1)
    layer = L.DenseLayer(layer, 2 * architecture[-1],
                         nonlinearity=nl.identity,
                         name=name)
    # Q(z|x)
    l_q = GaussianParameters(layer, name="q")
    # Samples from Q
    layer = GaussianSampler(l_q, name="q_sample")
    # Decoder z -> x
    for i, a in reversed(list(enumerate(architecture[1:-1]))):
        name = "decode_{}".format(i + 1)
        layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name)
    # P(x|z)
    layer = L.DenseLayer(layer, architecture[0],
                         nonlinearity=nl.sigmoid,
                         name="p")
    q_params, prediction = L.get_output((l_q, layer))
    log_p_x = T.nnet.binary_crossentropy(prediction, l_in.input_var)
    log_p_x = T.sum(log_p_x, axis=1)
    kl = gaussian_kl(q_params)
    neg_elbo = T.mean(log_p_x + kl)
    params = L.get_all_params(layer, trainable=True)
    if optimizer == "yellow_fin":
        updates = yellow_fin(neg_elbo, params)
    elif optimizer == "adam":
        updates = adam(neg_elbo, params, 0.001)
    else:
        raise NotImplementedError("Unknown optimizer: {}".format(optimizer))
    return theano.function([l_in.input_var], neg_elbo, updates=updates)
class GaussianParameters(L.Layer):
    def __init__(self, incoming, nonlinearity=nl.softplus,
                 eps=1e-6, **kwargs):
        super(GaussianParameters, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        self.eps = eps

    @staticmethod
    def get_raw(distribution):
        d = T.int_div(distribution.shape[1], 2)
        mu = distribution[:, :d]
        pre_sigma = distribution[:, d:]
        return mu, pre_sigma

    def get_output_for(self, inputs, **kwargs):
        mu, pre_sigma = GaussianParameters.get_raw(inputs)
        sigma = self.nonlinearity(pre_sigma) + self.eps
        return T.concatenate((mu, sigma), axis=1)
class GaussianSampler(L.Layer):
    def __init__(self, incoming,
                 num_samples=1,
                 **kwargs):
        super(GaussianSampler, self).__init__(incoming, **kwargs)
        self.num_samples = num_samples

    def get_output_shape_for(self, input_shape):
        shape = input_shape
        assert shape[1] % 2 == 0
        out_shape = (shape[0] * self.num_samples if shape[0] is not None else None,
                     shape[1] // 2)
        return out_shape

    def get_output_for(self, inputs, **kwargs):
        mu, sigma = GaussianParameters.get_raw(inputs)
        n, d = mu.shape
        epsilon = normal((n, self.num_samples, d))
        # Reparameterisation: z = mu + sigma * epsilon
        samples = mu.dimshuffle(0, 'x', 1) + sigma.dimshuffle(0, 'x', 1) * epsilon
        samples = T.reshape(samples, (n * self.num_samples, d))
        return samples
def gaussian_kl(q_params):
    """
    KL(q || p) where p is the standard normal prior N(0, I).
    """
    q_mu, q_sigma = GaussianParameters.get_raw(q_params)
    # Trace term
    kl = T.sqr(q_sigma)
    # Squared difference of the means (the prior mean is 0)
    kl += T.sqr(q_mu - 0)
    # Both divided by the prior variance (which is 1)
    kl /= T.sqr(1)
    # Minus 1
    kl -= T.constant(1)
    # Plus log determinant of the prior sigma (log 1 = 0)
    kl += 2 * T.log(1)
    # Minus log determinant of q_sigma
    kl -= 2 * T.log(q_sigma)
    # Flatten
    kl = T.flatten(kl, outdim=2)
    # Sum per data point
    kl = 0.5 * T.sum(kl, axis=1)
    return kl
def normal(shape, random=None):
    # Use the provided random stream or create a new one
    random = random or RandomStreams(get_rng().randint(1, 2147462579))
    samples = random.normal(shape, dtype=theano.config.floatX)
    # Treat the noise as a constant with respect to differentiation
    samples = zero_grad(samples)
    return samples
class Mnist(object):
    """
    The MNIST (Modified National Institute of Standards and Technology) dataset.
    """
    download_url = "http://deeplearning.net/data/mnist/mnist.pkl.gz"

    def __init__(self, flatten=True, path=None):
        self.flatten = flatten
        self.path = path if path is not None else "./mnist"
        self.data = dict()

    def iter(self, partition, batch_size, shuffle=True):
        n = self.data[partition][1].shape[0]
        index = np.arange(n)
        if shuffle:
            np.random.shuffle(index)
        for i in range(0, n, batch_size):
            yield tuple(x[index[i: i + batch_size]] for x in self.data[partition])

    def load(self, verbose=True):
        c = 0
        while not self.verify_download():
            self.download()
            c += 1
            if c >= 5:
                raise ValueError("Could not verify download after 5 attempts.")
        file_name = os.path.join(self.path, "mnist.pkl.gz")
        with gzip.open(file_name, 'rb') as f:
            data = pickle.load(f, encoding='latin1')
        if self.flatten:
            self.data["train"] = data[0]
            self.data["valid"] = data[1]
            self.data["test"] = data[2]
        else:
            self.data["train"] = [data[0][0].reshape((-1, 28, 28)), data[0][1]]
            self.data["valid"] = [data[1][0].reshape((-1, 28, 28)), data[1][1]]
            self.data["test"] = [data[2][0].reshape((-1, 28, 28)), data[2][1]]
        if verbose:
            print("Train - Images:", self.data["train"][0].shape, "Targets:", self.data["train"][1].shape)
            print("Valid - Images:", self.data["valid"][0].shape, "Targets:", self.data["valid"][1].shape)
            print("Test - Images:", self.data["test"][0].shape, "Targets:", self.data["test"][1].shape)

    def download(self):
        """
        Downloads the dataset from the website.
        """
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        file_path = os.path.join(self.path, "mnist.pkl.gz")
        # `general_utils` is an external helper module not included in this gist
        general_utils.download_file(Mnist.download_url, file_path)

    def verify_download(self):
        """
        Verifies that the dataset has been downloaded and is present.
        """
        return os.path.exists(os.path.join(self.path, "mnist.pkl.gz"))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("optimizer", type=str)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=250)
    run(**vars(parser.parse_args()))
import numpy as np
import theano
import theano.tensor as T
from theano.printing import Print

fx = theano.config.floatX


def print_values(var, msg):
    """
    Makes an op to print the values of the variable with the message
    """
    return Print(msg)(var)


def solve(c, d, h_min, debug=False):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu)
    # Minimising this reduces to solving
    #   y^3 + p * y + p = 0
    # with y = x - 1 and p = (D^2 h_min^2) / (2 * C)
    p = (T.sqr(d) * T.sqr(h_min)) / (T.constant(2) * c)
    w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5)
    w = T.power(w3, 1.0 / 3.0)
    y = w - p / (3 * w)
    sqrt_mu = y + 1
    if debug:
        value = print_values(y * y * y + p * y + p, "derivative_value")
        sqrt_mu += 1e-20 * value
    return sqrt_mu


def solve_np(c, d, h_min):
    p = (np.square(d) * np.square(h_min)) / np.asarray(2 * c, dtype=fx)
    coeffs = np.asarray([-1.0, 3.0, -3.0 - p, 1.0], dtype=fx)
    return np.roots(coeffs)


if __name__ == "__main__":
    N = 20
    D = 5
    c = (np.random.rand(N) * D).astype(fx)
    d = (np.random.rand(N) * D).astype(fx)
    h_min = (np.random.rand(N) * D).astype(fx)
    for i in range(N):
        x1 = solve(c[i], d[i], h_min[i]).eval().item()
        x2 = solve_np(c[i], d[i], h_min[i])
        ind = np.real(x2) == np.sqrt(x2 * np.conj(x2))
        x2 = np.real(x2[ind])[0]
        print("{:.10f} - {:.10f} = {:.10f}".format(x1, x2, (x1 - x2)))
import numpy as np
import theano
import theano.tensor as T
from theano.printing import Print
from collections import OrderedDict
def yellow_fin(loss, params, beta=0.999,
               learning_rate_init=1.0, momentum_init=0.0,
               learning_rate_factor=1.0, t=None,
               window_width=20, debug=False):
    """
    The YellowFin algorithm.

    :param loss: Theano expression of the loss
    :param params: list of shared variables
    :param beta: the moving average smoothing variable
    :param learning_rate_init: initial learning rate
    :param momentum_init: initial momentum
    :param learning_rate_factor: an additional multiplicative factor applied to the base learning rate
    :param t: optionally pass your own time variable
    :param window_width: width of the window for calculating h_max and h_min
    :param debug: flag for debugging
    :return: an OrderedDict with the parameter updates
    """
    grads = T.grad(loss, params)
    updates = OrderedDict()
    alpha = theano.shared(value_floatX(learning_rate_init), name="learning_rate")
    mu = theano.shared(value_floatX(momentum_init), name="momentum")
    if t is None:
        t = theano.shared(np.asarray(0).astype(np.int32), name="t")
    updates[t] = t + 1
    # Fetch variables from the routines
    h_max, h_min = curvature_range(grads, beta, t, updates, window_width)
    c = gradient_variance(grads, params, beta, updates)
    d = distance_to_optim(grads, beta, updates)
    if debug:
        h_max = print_values(h_max, "h_max")
        h_min = print_values(h_min, "h_min")
        c = print_values(c, "c")
        d = print_values(d, "d")
    # Get the solution to the minimisation problem for mu
    sqrt_mu1 = solve(c, d, h_min)
    if debug:
        sqrt_mu1 = print_values(sqrt_mu1, "sqrt_mu1")
    sqrt_mu2 = (T.sqrt(h_max) - T.sqrt(h_min)) / (T.sqrt(h_max) + T.sqrt(h_min))
    sqrt_mu = T.maximum(sqrt_mu1, sqrt_mu2)
    # Given the solution, compute the final mu_t and alpha_t
    alpha_t = T.sqr(1 - sqrt_mu) / h_min
    mu_t = T.sqr(sqrt_mu)
    if debug:
        mu_t = print_values(mu_t, "mu_t")
        alpha_t = print_values(alpha_t, "alpha_t")
    # Update moving averages
    updates[mu] = ema(beta, mu, mu_t)
    updates[alpha] = ema(beta, alpha, alpha_t)
    if debug:
        updates[mu] = print_values(updates[mu], "mu")
        updates[alpha] = print_values(updates[alpha], "alpha")
    # Apply momentum
    momentum(grads, params, learning_rate_factor * updates[alpha], updates[mu], updates)
    return updates
def curvature_range(grads, beta, t, updates, window_width=20, debug=False):
    """
    Routine for calculating the h_max and h_min curvature range.
    """
    # Update the window
    window = theano.shared(np.zeros((window_width, ), dtype=theano.config.floatX), name="window")
    t_mod = T.mod(t, window_width)
    updates[window] = T.set_subtensor(window[t_mod], sum(T.sum(T.sqr(g)) for g in grads))
    if debug:
        updates[window] = print_values(updates[window], "window")
    # Get the h_max_t and h_min_t
    t = T.minimum(t + 1, window_width)
    h_max_t = T.max(updates[window][:t])
    h_min_t = T.min(updates[window][:t])
    # Update the moving averages
    h_max = theano.shared(value_floatX(0.0), name="h_max")
    h_min = theano.shared(value_floatX(0.0), name="h_min")
    updates[h_max] = ema(beta, h_max, h_max_t)
    updates[h_min] = ema(beta, h_min, h_min_t)
    return updates[h_max], updates[h_min]
def gradient_variance(grads, params, beta, updates):
    """
    Routine for calculating the variance of the gradients.
    """
    # Total variance
    variance = 0
    for param, grad in zip(params, grads):
        # Make shared variables
        mom1 = shared_mirror(param)
        mom2 = shared_mirror(param)
        # Update moving averages
        updates[mom1] = ema(beta, mom1, grad)
        updates[mom2] = ema(beta, mom2, T.sqr(grad))
        # Update the total variance
        variance += T.sum(T.abs_(updates[mom2] - T.sqr(updates[mom1])))
    return variance
def distance_to_optim(grads, beta, updates):
    """
    Routine for calculating the distance to the optimum.
    """
    # Had issues with initializing to 0.0, so switched to 1.0
    g = theano.shared(value_floatX(1.0), name="g")
    h = theano.shared(value_floatX(1.0), name="h")
    d = theano.shared(value_floatX(1.0), name="d")
    # L2 norm of the gradients
    l2_norm = sum(T.sum(T.sqr(grad)) for grad in grads)
    updates[g] = ema(beta, g, T.sqrt(l2_norm))
    updates[h] = ema(beta, h, l2_norm)
    updates[d] = ema(beta, d, updates[g] / updates[h])
    return updates[d]
def solve(c, d, h_min, debug=False):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu)
    # Minimising this reduces to solving
    #   y^3 + p * y + p = 0
    # with y = x - 1 and p = (D^2 h_min^2) / (2 * C)
    p = (T.sqr(d) * T.sqr(h_min)) / (2 * c)
    w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5)
    w = T.power(w3, 1.0 / 3.0)
    y = w - p / (3 * w)
    sqrt_mu = y + 1
    if debug:
        value = print_values(y * y * y + p * y + p, "derivative_value")
        sqrt_mu += 1e-20 * value
    return sqrt_mu
def momentum(grads, params, learning_rate, momentum, updates=None):
    """
    Standard momentum - copied from the Lasagne library.
    """
    updates = OrderedDict() if updates is None else updates
    velocities = [shared_mirror(p) for p in params]
    for param, grad, v in zip(params, grads, velocities):
        updates[v] = v * momentum - learning_rate * grad
        updates[param] = param + updates[v]
    return updates


def print_values(var, msg):
    """
    Makes an op to print the values of the variable with the message
    """
    return Print(msg)(var)


def ema(alpha, s_t, x_t):
    """
    Exponential moving average
    """
    return alpha * s_t + (1 - alpha) * x_t


def value_floatX(x):
    """
    Converts the value to a numpy array of type theano.config.floatX
    """
    return np.asarray(x).astype(theano.config.floatX)


def shared_mirror(shared):
    """
    Creates a shared variable with the same specs as the input.
    """
    value = shared.get_value(borrow=True)
    return theano.shared(np.zeros(value.shape, dtype=value.dtype),
                         broadcastable=shared.broadcastable)
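
A minimal usage sketch of the `yellow_fin` function above. Only the `yellow_fin(loss, params)` signature comes from the code; the toy least-squares model, the data and the variable names here are illustrative, and the snippet assumes the functions above are available in the same module.

import numpy as np
import theano
import theano.tensor as T

# Toy least-squares problem, purely to show how the updates are wired in
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(np.zeros(10, dtype=theano.config.floatX), name="w")
loss = T.mean(T.sqr(T.dot(x, w) - y))
# yellow_fin returns an OrderedDict of updates, like the Lasagne optimizers
updates = yellow_fin(loss, [w])
train = theano.function([x, y], loss, updates=updates)
data_x = np.random.rand(100, 10).astype(theano.config.floatX)
data_y = np.random.rand(100).astype(theano.config.floatX)
for step in range(10):
    print(step, train(data_x, data_y))
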
@JianGoForIt

Hi @botev,
I am the author of YellowFin and of the PyTorch version.
Thanks for the implementation. I read your post on Reddit and noticed a few things that need to be checked (there might be more points than I noticed).

  1. The smoothing coefficient might be sub-optimal; we use 0.999 instead.

  2. Not really sure whether the cubic solution is correct or not. Could you please verify that the solution is in [0, 1]? In the TF and PyTorch implementations, we use the numpy root function (a sketch of one such check is below).
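
For concreteness, a minimal sketch of such a check with `np.roots` (illustrative only; the helper name is made up, and p is defined as in the gist's solver):

import numpy as np

def check_mu_root(c, d, h_min):
    # Cubic in x = sqrt(mu): x^3 - 3 x^2 + (3 + p) x - 1 = 0 (same roots as the gist's solve_np)
    p = d ** 2 * h_min ** 2 / (2.0 * c)
    roots = np.roots([1.0, -3.0, 3.0 + p, -1.0])
    real = roots[np.abs(roots.imag) < 1e-8].real
    # The single real root should lie in [0, 1]
    return real, bool(np.all((real >= 0.0) & (real <= 1.0)))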

Just a quick suggestion: maybe you can construct a simple example for both the PyTorch version and your implementation, so that you can compare intermediate statistics, etc.

Another quick question: what is the phenomenon you observe with this implementation that makes it not better than Adam? Is there an explosion? If so, you may want to follow up on the thread here.

Hope this helps, and please keep us updated on the results.

@botev (Author) commented Jul 13, 2017

@JianGoForIt
Thanks a lot for stopping by and for the useful comments.

> The smoothing coefficient might be suboptimal, we use 0.999 instead.

I've updated this to reflect that and made learning_rate_init 1.0 by default.

> Not really sure whether the cubic solution is correct or not. Could you please verify that the solution is in [0, 1]? In the TF and PyTorch implementations, we use the numpy root function.

I've added to the gist a short Python script which verifies that the Theano cubic solver is indeed accurate (between 1e-5 and 1e-7 difference from np.roots). The solver relies on Vieta's substitution for the cubic equation and on the fact that there exists only a single real root, since p > 0.
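
For completeness, a sketch of the algebra behind that solver: Vieta's substitution turns the depressed cubic into a quadratic in $w^3$,

$$
y = w - \frac{p}{3w}
  \;\Longrightarrow\; w^6 + p\,w^3 - \frac{p^3}{27} = 0
  \;\Longrightarrow\; w^3 = p\left(\sqrt{\tfrac{1}{4} + \tfrac{p}{27}} - \tfrac{1}{2}\right),
$$

which is exactly the `w3` computed in `solve`; and since $p > 0$, the discriminant $-4p^3 - 27p^2$ of $y^3 + p\,y + p$ is negative, so there is indeed a single real root.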

> Another quick question: what is the phenomenon you observe with this implementation that makes it not better than Adam? Is there an explosion? If so, you may want to follow up on the thread here.

In this regard I don't get any NaNs or anything exploding; it is just that the learning curves are worse than those from Adam. For more elaborate results I suggest checking out the discussion here, where I ran the 56-layer ResNet from a standard Lasagne Recipe; the results I got are that YellowFin seems to underperform both Momentum and Adam.
