@botev
Last active July 13, 2017 13:42
Theano Yellow Fin
import os
import gzip
import pickle
import argparse
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.gradient import zero_grad
import lasagne.layers as L
import lasagne.nonlinearities as nl
from lasagne.updates import adam
from lasagne.random import get_rng
# Note: yellow_fin and its helpers are defined in the accompanying file of this gist
def run(optimizer, epochs=100, batch_size=250, seed=None):
    if seed is not None:
        np.random.seed(seed)
    dataset = Mnist()
    dataset.load()
    # Build the training function:
    f = build_train_func(optimizer, [784, 512, 512, 64])
    num_batches = int(np.ceil(50000.0 / float(batch_size)))
    results = np.zeros((num_batches * epochs, 1))
    i = 0
    for e in range(epochs):
        for x, _ in dataset.iter("train", batch_size):
            # Sample binarised version
            x = (np.random.rand(*x.shape) < x).astype(theano.config.floatX)
            results[i, 0] = f(x)
            print("[{}] Loss:".format(i), results[i, 0])
            i += 1
    np.savetxt("results.csv", results, delimiter=",")
    print("Done")
def build_train_func(optimizer, architecture):
    # Input
    l_in = L.InputLayer((None, architecture[0]), name="x")
    # x
    layer = l_in
    # Encoder x -> z
    for i, a in enumerate(architecture[1:-1]):
        name = "encode_{}".format(i + 1)
        layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name)
    name = "encode_{}".format(len(architecture) - 1)
    layer = L.DenseLayer(layer, 2 * architecture[-1],
                         nonlinearity=nl.identity,
                         name=name)
    # Q(z|x)
    l_q = GaussianParameters(layer, name="q")
    # Samples from Q
    layer = GaussianSampler(l_q, name="q_sample")
    # Decoder z -> x
    for i, a in reversed(list(enumerate(architecture[1:-1]))):
        name = "decode_{}".format(i + 1)
        layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name)
    # P(x|z)
    layer = L.DenseLayer(layer, architecture[0],
                         nonlinearity=nl.sigmoid,
                         name="p")
    q_params, prediction = L.get_output((l_q, layer))
    log_p_x = T.nnet.binary_crossentropy(prediction, l_in.input_var)
    log_p_x = T.sum(log_p_x, axis=1)
    kl = gaussian_kl(q_params)
    neg_elbo = T.mean(log_p_x + kl)
    params = L.get_all_params(layer, trainable=True)
    if optimizer == "yellow_fin":
        updates = yellow_fin(neg_elbo, params)
    elif optimizer == "adam":
        updates = adam(neg_elbo, params, 0.001)
    else:
        raise NotImplementedError("Unknown optimizer: {}".format(optimizer))
    return theano.function([l_in.input_var], neg_elbo, updates=updates)
class GaussianParameters(L.Layer):
    def __init__(self, incoming, nonlinearity=nl.softplus,
                 eps=1e-6, **kwargs):
        super(GaussianParameters, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        self.eps = eps

    @staticmethod
    def get_raw(distribution):
        d = T.int_div(distribution.shape[1], 2)
        mu = distribution[:, :d]
        pre_sigma = distribution[:, d:]
        return mu, pre_sigma

    def get_output_for(self, inputs, **kwargs):
        mu, pre_sigma = GaussianParameters.get_raw(inputs)
        sigma = self.nonlinearity(pre_sigma) + self.eps
        return T.concatenate((mu, sigma), axis=1)
class GaussianSampler(L.Layer):
    def __init__(self, incoming,
                 num_samples=1,
                 **kwargs):
        super(GaussianSampler, self).__init__(incoming, **kwargs)
        self.num_samples = num_samples

    def get_output_shape_for(self, input_shape):
        shape = input_shape
        assert shape[1] % 2 == 0
        out_shape = (shape[0] * self.num_samples if shape[0] is not None else None,
                     shape[1] // 2)
        return out_shape

    def get_output_for(self, inputs, **kwargs):
        mu, sigma = GaussianParameters.get_raw(inputs)
        n, d = mu.shape
        epsilon = normal((n, self.num_samples, d))
        # Reparameterisation: z = mu + sigma * epsilon
        samples = mu.dimshuffle(0, 'x', 1) + sigma.dimshuffle(0, 'x', 1) * epsilon
        samples = T.reshape(samples, (n * self.num_samples, d))
        return samples
def gaussian_kl(q_params):
    """
    KL(q || p) where p is the standard normal prior N(0, I).
    """
    q_mu, q_sigma = GaussianParameters.get_raw(q_params)
    # Trace term
    kl = T.sqr(q_sigma)
    # Squared difference of the means (the prior mean is 0)
    kl += T.sqr(q_mu - 0)
    # Both divided by the prior variance (which is 1)
    kl /= T.sqr(1)
    # Minus 1
    kl -= T.constant(1)
    # Plus log determinant of the prior sigma (log 1 = 0)
    kl += 2 * T.log(1)
    # Minus log determinant of q_sigma
    kl -= 2 * T.log(q_sigma)
    # Flatten
    kl = T.flatten(kl, outdim=2)
    # Sum per data point
    kl = 0.5 * T.sum(kl, axis=1)
    return kl
def normal(shape, random=None):
    # Use the provided random stream or create a new one
    random = random or RandomStreams(get_rng().randint(1, 2147462579))
    samples = random.normal(shape, dtype=theano.config.floatX)
    # Treat the noise as a constant with respect to differentiation
    samples = zero_grad(samples)
    return samples
class Mnist(object):
    """
    The MNIST (Modified National Institute of Standards and Technology) dataset.
    """
    download_url = "http://deeplearning.net/data/mnist/mnist.pkl.gz"

    def __init__(self, flatten=True, path=None):
        self.flatten = flatten
        self.path = path if path is not None else "./mnist"
        self.data = dict()

    def iter(self, partition, batch_size, shuffle=True):
        n = self.data[partition][1].shape[0]
        index = np.arange(n)
        if shuffle:
            np.random.shuffle(index)
        for i in range(0, n, batch_size):
            yield tuple(x[index[i: i + batch_size]] for x in self.data[partition])

    def load(self, verbose=True):
        c = 0
        while not self.verify_download():
            self.download()
            c += 1
            if c >= 5:
                raise ValueError("Could not verify download after 5 attempts.")
        file_name = os.path.join(self.path, "mnist.pkl.gz")
        with gzip.open(file_name, 'rb') as f:
            data = pickle.load(f, encoding='latin1')
        if self.flatten:
            self.data["train"] = data[0]
            self.data["valid"] = data[1]
            self.data["test"] = data[2]
        else:
            self.data["train"] = [data[0][0].reshape((-1, 28, 28)), data[0][1]]
            self.data["valid"] = [data[1][0].reshape((-1, 28, 28)), data[1][1]]
            self.data["test"] = [data[2][0].reshape((-1, 28, 28)), data[2][1]]
        if verbose:
            print("Train - Images:", self.data["train"][0].shape, "Targets:", self.data["train"][1].shape)
            print("Valid - Images:", self.data["valid"][0].shape, "Targets:", self.data["valid"][1].shape)
            print("Test - Images:", self.data["test"][0].shape, "Targets:", self.data["test"][1].shape)

    def download(self):
        """
        Downloads the dataset from the website.
        """
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        file_path = os.path.join(self.path, "mnist.pkl.gz")
        # `general_utils` is an external helper module not included in this gist
        general_utils.download_file(Mnist.download_url, file_path)

    def verify_download(self):
        """
        Verifies that the dataset has been downloaded and is present.
        """
        return os.path.exists(os.path.join(self.path, "mnist.pkl.gz"))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("optimizer", type=str)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=250)
    run(**vars(parser.parse_args()))
import numpy as np
import theano
import theano.tensor as T
from theano.printing import Print

fx = theano.config.floatX


def print_values(var, msg):
    """
    Makes an op to print the values of the variable with the message
    """
    return Print(msg)(var)


def solve(c, d, h_min, debug=False):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu)
    # Minimising this reduces to solving
    #   y^3 + p * y + p = 0
    # with y = x - 1 and p = (D^2 h_min^2) / (2 * C)
    p = (T.sqr(d) * T.sqr(h_min)) / (T.constant(2) * c)
    w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5)
    w = T.power(w3, 1.0 / 3.0)
    y = w - p / (3 * w)
    sqrt_mu = y + 1
    if debug:
        value = print_values(y * y * y + p * y + p, "derivative_value")
        sqrt_mu += 1e-20 * value
    return sqrt_mu


def solve_np(c, d, h_min):
    p = (np.square(d) * np.square(h_min)) / np.asarray(2 * c, dtype=fx)
    coeffs = np.asarray([-1.0, 3.0, -3.0 - p, 1.0], dtype=fx)
    return np.roots(coeffs)


if __name__ == "__main__":
    N = 20
    D = 5
    c = (np.random.rand(N) * D).astype(fx)
    d = (np.random.rand(N) * D).astype(fx)
    h_min = (np.random.rand(N) * D).astype(fx)
    for i in range(N):
        x1 = solve(c[i], d[i], h_min[i]).eval().item()
        x2 = solve_np(c[i], d[i], h_min[i])
        ind = np.real(x2) == np.sqrt(x2 * np.conj(x2))
        x2 = np.real(x2[ind])[0]
        print("{:.10f} - {:.10f} = {:.10f}".format(x1, x2, (x1 - x2)))
import numpy as np
import theano
import theano.tensor as T
from theano.printing import Print
from collections import OrderedDict
def yellow_fin(loss, params, beta=0.999,
               learning_rate_init=1.0, momentum_init=0.0,
               learning_rate_factor=1.0, t=None,
               window_width=20, debug=False):
    """
    The YellowFin algorithm.

    :param loss: Theano expression of the loss
    :param params: list of shared variables
    :param beta: the moving average smoothing variable
    :param learning_rate_init: initial learning rate
    :param momentum_init: initial momentum
    :param learning_rate_factor: an additional multiplicative factor applied to the base learning rate
    :param t: optionally pass your own time variable
    :param window_width: width of the window for calculating h_max and h_min
    :param debug: flag for debugging
    :return: an OrderedDict with the parameter updates
    """
    grads = T.grad(loss, params)
    updates = OrderedDict()
    alpha = theano.shared(value_floatX(learning_rate_init), name="learning_rate")
    mu = theano.shared(value_floatX(momentum_init), name="momentum")
    if t is None:
        t = theano.shared(np.asarray(0).astype(np.int32), name="t")
    updates[t] = t + 1
    # Fetch variables from the routines
    h_max, h_min = curvature_range(grads, beta, t, updates, window_width)
    c = gradient_variance(grads, params, beta, updates)
    d = distance_to_optim(grads, beta, updates)
    if debug:
        h_max = print_values(h_max, "h_max")
        h_min = print_values(h_min, "h_min")
        c = print_values(c, "c")
        d = print_values(d, "d")
    # Get the solution to the minimisation problem for mu
    sqrt_mu1 = solve(c, d, h_min)
    if debug:
        sqrt_mu1 = print_values(sqrt_mu1, "sqrt_mu1")
    sqrt_mu2 = (T.sqrt(h_max) - T.sqrt(h_min)) / (T.sqrt(h_max) + T.sqrt(h_min))
    sqrt_mu = T.maximum(sqrt_mu1, sqrt_mu2)
    # Given the solution, compute the final mu_t and alpha_t
    alpha_t = T.sqr(1 - sqrt_mu) / h_min
    mu_t = T.sqr(sqrt_mu)
    if debug:
        mu_t = print_values(mu_t, "mu_t")
        alpha_t = print_values(alpha_t, "alpha_t")
    # Update moving averages
    updates[mu] = ema(beta, mu, mu_t)
    updates[alpha] = ema(beta, alpha, alpha_t)
    if debug:
        updates[mu] = print_values(updates[mu], "mu")
        updates[alpha] = print_values(updates[alpha], "alpha")
    # Apply momentum
    momentum(grads, params, learning_rate_factor * updates[alpha], updates[mu], updates)
    return updates
def curvature_range(grads, beta, t, updates, window_width=20, debug=False):
    """
    Routine for calculating the h_max and h_min curvature range.
    """
    # Update the window
    window = theano.shared(np.zeros((window_width, ), dtype=theano.config.floatX), name="window")
    t_mod = T.mod(t, window_width)
    updates[window] = T.set_subtensor(window[t_mod], sum(T.sum(T.sqr(g)) for g in grads))
    if debug:
        updates[window] = print_values(updates[window], "window")
    # Get the h_max_t and h_min_t
    t = T.minimum(t + 1, window_width)
    h_max_t = T.max(updates[window][:t])
    h_min_t = T.min(updates[window][:t])
    # Update the moving averages
    h_max = theano.shared(value_floatX(0.0), name="h_max")
    h_min = theano.shared(value_floatX(0.0), name="h_min")
    updates[h_max] = ema(beta, h_max, h_max_t)
    updates[h_min] = ema(beta, h_min, h_min_t)
    return updates[h_max], updates[h_min]
def gradient_variance(grads, params, beta, updates):
    """
    Routine for calculating the variance of the gradients.
    """
    # Total variance
    variance = 0
    for param, grad in zip(params, grads):
        # Make shared variables
        mom1 = shared_mirror(param)
        mom2 = shared_mirror(param)
        # Update moving averages
        updates[mom1] = ema(beta, mom1, grad)
        updates[mom2] = ema(beta, mom2, T.sqr(grad))
        # Update the total variance
        variance += T.sum(T.abs_(updates[mom2] - T.sqr(updates[mom1])))
    return variance
def distance_to_optim(grads, beta, updates):
    """
    Routine for calculating the distance to the optimum.
    """
    # Had issues with initializing to 0.0, so switched to 1.0
    g = theano.shared(value_floatX(1.0), name="g")
    h = theano.shared(value_floatX(1.0), name="h")
    d = theano.shared(value_floatX(1.0), name="d")
    # L2 norm of the gradients
    l2_norm = sum(T.sum(T.sqr(grad)) for grad in grads)
    updates[g] = ema(beta, g, T.sqrt(l2_norm))
    updates[h] = ema(beta, h, l2_norm)
    updates[d] = ema(beta, d, updates[g] / updates[h])
    return updates[d]
def solve(c, d, h_min, debug=False):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu)
    # Minimising this reduces to solving
    #   y^3 + p * y + p = 0
    # with y = x - 1 and p = (D^2 h_min^2) / (2 * C)
    p = (T.sqr(d) * T.sqr(h_min)) / (2 * c)
    w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5)
    w = T.power(w3, 1.0 / 3.0)
    y = w - p / (3 * w)
    sqrt_mu = y + 1
    if debug:
        value = print_values(y * y * y + p * y + p, "derivative_value")
        sqrt_mu += 1e-20 * value
    return sqrt_mu
def momentum(grads, params, learning_rate, momentum, updates=None):
    """
    Standard momentum - copied from the Lasagne library.
    """
    updates = OrderedDict() if updates is None else updates
    velocities = [shared_mirror(p) for p in params]
    for param, grad, v in zip(params, grads, velocities):
        updates[v] = v * momentum - learning_rate * grad
        updates[param] = param + updates[v]
    return updates


def print_values(var, msg):
    """
    Makes an op to print the values of the variable with the message
    """
    return Print(msg)(var)


def ema(alpha, s_t, x_t):
    """
    Exponential moving average
    """
    return alpha * s_t + (1 - alpha) * x_t


def value_floatX(x):
    """
    Converts the value to a numpy array of type theano.config.floatX
    """
    return np.asarray(x).astype(theano.config.floatX)


def shared_mirror(shared):
    """
    Creates a shared variable with the same specs as the input.
    """
    value = shared.get_value(borrow=True)
    return theano.shared(np.zeros(value.shape, dtype=value.dtype),
                         broadcastable=shared.broadcastable)
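
A minimal usage sketch of the `yellow_fin` function above. Only the `yellow_fin(loss, params)` signature comes from the code; the toy least-squares model, the data and the variable names here are illustrative, and the snippet assumes the functions above are available in the same module.

import numpy as np
import theano
import theano.tensor as T

# Toy least-squares problem, purely to show how the updates are wired in
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(np.zeros(10, dtype=theano.config.floatX), name="w")
loss = T.mean(T.sqr(T.dot(x, w) - y))
# yellow_fin returns an OrderedDict of updates, like the Lasagne optimizers
updates = yellow_fin(loss, [w])
train = theano.function([x, y], loss, updates=updates)
data_x = np.random.rand(100, 10).astype(theano.config.floatX)
data_y = np.random.rand(100).astype(theano.config.floatX)
for step in range(10):
    print(step, train(data_x, data_y))
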
@JianGoForIt

Hi @botev,
I am the author of YellowFin and of the PyTorch version.
Thanks for the implementation. I read your post on Reddit and noticed a few things that need to be checked (there might be more points than I noticed).

  1. The smoothing coefficient might be sub-optimal; we use 0.999 instead.

  2. Not really sure whether the cubic solution is correct or not. Could you please verify that the solution is in [0, 1]? In the TF and PyTorch implementations, we use the numpy root function (a sketch of one such check is below).
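
For concreteness, a minimal sketch of such a check with `np.roots` (illustrative only; the helper name is made up, and p is defined as in the gist's solver):

import numpy as np

def check_mu_root(c, d, h_min):
    # Cubic in x = sqrt(mu): x^3 - 3 x^2 + (3 + p) x - 1 = 0 (same roots as the gist's solve_np)
    p = d ** 2 * h_min ** 2 / (2.0 * c)
    roots = np.roots([1.0, -3.0, 3.0 + p, -1.0])
    real = roots[np.abs(roots.imag) < 1e-8].real
    # The single real root should lie in [0, 1]
    return real, bool(np.all((real >= 0.0) & (real <= 1.0)))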

Just a quick suggestion: maybe you can construct a simple example for both the PyTorch version and your implementation, so that you can compare intermediate statistics, etc.

Another quick question: what is the phenomenon you observe with this implementation that makes it not better than Adam? Is there an explosion? If so, you may want to follow up on the thread here.

Hope this helps, and please keep us updated on the results.

@botev (Author) commented Jul 13, 2017

@JianGoForIt
Thanks a lot for stopping by and for the useful comments.

> The smoothing coefficient might be suboptimal, we use 0.999 instead.

I've updated this to reflect that and made learning_rate_init 1.0 by default.

> Not really sure whether the cubic solution is correct or not. Could you please verify that the solution is in [0, 1]? In the TF and PyTorch implementations, we use the numpy root function.

I've added to the gist a short Python script which verifies that the Theano cubic solver is indeed accurate (between 1e-5 and 1e-7 difference from np.roots). The solver relies on Vieta's substitution for the cubic equation and on the fact that there exists only a single real root, since p > 0.
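
For completeness, a sketch of the algebra behind that solver: Vieta's substitution turns the depressed cubic into a quadratic in $w^3$,

$$
y = w - \frac{p}{3w}
  \;\Longrightarrow\; w^6 + p\,w^3 - \frac{p^3}{27} = 0
  \;\Longrightarrow\; w^3 = p\left(\sqrt{\tfrac{1}{4} + \tfrac{p}{27}} - \tfrac{1}{2}\right),
$$

which is exactly the `w3` computed in `solve`; and since $p > 0$, the discriminant $-4p^3 - 27p^2$ of $y^3 + p\,y + p$ is negative, so there is indeed a single real root.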

> Another quick question: what is the phenomenon you observe with this implementation that makes it not better than Adam? Is there an explosion? If so, you may want to follow up on the thread here.

In this regard I don't get any NaNs or anything exploding; it is just that the learning curves are worse than those from Adam. For more elaborate results I suggest checking out the discussion here, where I ran the 56-layer ResNet from a standard Lasagne Recipe; the results I got are that YellowFin seems to underperform both Momentum and Adam.
