Last active
July 13, 2017 13:42
-
-
Save botev/f8b32c00eafee222e47393f7f0747666 to your computer and use it in GitHub Desktop.
Theano Yellow Fin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import argparse | |
import theano | |
import theano.tensor as T | |
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams | |
from theano.gradient import zero_grad | |
import lasagne.layers as L | |
import lasagne.nonlinearities as nl | |
from lasagne.updates import adam | |
from lasagne.random import get_rng | |
def run(optimizer, epochs=100, batch_size=250, seed=None): | |
if seed is not None: | |
np.random.seed(seed) | |
dataset = Mnist() | |
dataset.load() | |
# Build the training function: | |
f = build_train_func(optimizer, [784, 512, 512, 64]) | |
num_batches = int(np.ceil(50000.0 / float(batch_size))) | |
results = np.zeros((num_batches * epochs, 1)) | |
i = 0 | |
for e in range(epochs): | |
for x, _ in dataset.iter("train", batch_size): | |
# Sample binarised version | |
x = (np.random.rand(*x.shape) < x).astype(theano.config.floatX) | |
results[i, 0] = f(x) | |
print("[{}]Loss:".format(i), results[i, 0]) | |
i += 1 | |
np.savetxt("results.csv", results, delimiter=",") | |
print("Done") | |
def build_train_func(optimizer, architecture): | |
# Input | |
l_in = L.InputLayer((None, architecture[0]), name="x") | |
# x | |
layer = l_in | |
# Encoder x -> z | |
for i, a in enumerate(architecture[1:-1]): | |
name = "encode_{}".format(i + 1) | |
layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name) | |
name = "encode_{}".format(len(architecture) - 1) | |
layer = L.DenseLayer(layer, 2 * architecture[-1], | |
nonlinearity=nl.identity, | |
name=name) | |
# Q(z|x) | |
l_q = GaussianParameters(layer, name="q") | |
# Samples from Q | |
layer = GaussianSampler(l_q, name="q_sample") | |
# Decoder z -> x | |
for i, a in reversed(list(enumerate(architecture[1:-1]))): | |
name = "decode_{}".format(i + 1) | |
layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name) | |
# P(x|z) | |
layer = L.DenseLayer(layer, architecture[0], | |
nonlinearity=nl.sigmoid, | |
name="p") | |
q_params, prediction = L.get_output((l_q, layer)) | |
log_p_x = T.nnet.binary_crossentropy(prediction, l_in.input_var) | |
log_p_x = T.sum(log_p_x, axis=1) | |
kl = guassian_kl(q_params) | |
neg_elbo = T.mean(log_p_x + kl) | |
params = L.get_all_params(layer, trainable=True) | |
if optimizer == "yellow_fin": | |
updates = yellow_fin(neg_elbo, params) | |
elif optimizer == "adam": | |
updates = adam(neg_elbo, params, 0.001) | |
else: | |
raise NotImplementedError | |
return theano.function([l_in.input_var], neg_elbo, updates=updates) | |
class GaussianParameters(L.Layer): | |
def __init__(self, incoming, nonlinearity=nl.softplus, | |
eps=1e-6, **kwargs): | |
super(GaussianParameters, self).__init__(incoming, **kwargs) | |
self.nonlinearity = nonlinearity | |
self.eps = eps | |
@staticmethod | |
def get_raw(distribution): | |
d = T.int_div(distribution.shape[1], 2) | |
mu = distribution[:, :d] | |
pre_sigma = distribution[:, d:] | |
return mu, pre_sigma | |
def get_output_for(self, inputs, **kwargs): | |
mu, pre_sigma = GaussianParameters.get_raw(inputs) | |
sigma = self.nonlinearity(pre_sigma) + self.eps | |
return T.concatenate((mu, sigma), axis=1) | |
class GaussianSampler(L.Layer): | |
def __init__(self, incoming, | |
num_samples=1, | |
**kwargs): | |
super(GaussianSampler, self).__init__(incoming, **kwargs) | |
self.num_samples = num_samples | |
def get_output_shape_for(self, input_shape): | |
shape = input_shape | |
assert shape[1] % 2 == 0 | |
out_shape = (shape[0] * self.num_samples if shape[0] is not None else None, shape[1] // 2) | |
return out_shape | |
def get_output_for(self, inputs, **kwargs): | |
mu, sigma = GaussianParameters.get_raw(inputs) | |
n, d = mu.shape | |
epsilon = normal((n, self.num_samples, d)) | |
samples = mu.dimshuffle(0, 'x', 1) + sigma.dimshuffle(0, 'x', 1) * epsilon | |
samples = T.reshape(samples, (n * self.num_samples, d)) | |
return samples | |
def guassian_kl(q_params): | |
q_mu, q_sigma = GaussianParameters.get_raw(q_params) | |
# Trace | |
kl = T.sqr(q_sigma) | |
# Means difference | |
kl += T.sqr(q_mu - 0) | |
# Both times inverse of sigma p | |
kl /= T.sqr(1) | |
# Minus 1 | |
kl -= T.constant(1) | |
# Log determinant of p_sigma | |
kl += 2 * T.log(1) | |
# Log determinant of q_sigma | |
kl -= 2 * T.log(q_sigma) | |
# Flatten | |
kl = T.flatten(kl, outdim=2) | |
# Calculate per data point | |
kl = 0.5 * T.sum(kl, axis=1) | |
return kl | |
def normal(shape): | |
random = random or RandomStreams(get_rng().randint(1, 2147462579)) | |
samples = random.normal(shape, dtype=theano.config.floatX) | |
samples = zero_grad(samples) | |
return samples | |
class Mnist(object): | |
""" | |
The Mixed National Institute of Standards and Technology Dataset | |
""" | |
download_url = "http://deeplearning.net/data/mnist/mnist.pkl.gz" | |
def __init__(self, flatten=True, path=None): | |
self.flatten = flatten | |
self.path = "./mnist" | |
self.data = dict() | |
def iter(self, partition, batch_size, shuffle=True): | |
n = self.data[partition][1].shape[0] | |
index = np.arange(n) | |
if shuffle: | |
np.random.shuffle(index) | |
for i in range(0, n, batch_size): | |
yield (x[index[i: i + batch_size]] for x in self.data[partition]) | |
def load(self, verbose=True): | |
c = 0 | |
while not self.verify_download(): | |
self.download() | |
c += 1 | |
if c >= 5: | |
raise ValueError("Could not verify download after 5 attempts.") | |
file_name = os.path.join(self.path, "mnist.pkl.gz") | |
with gzip.open(file_name, 'rb') as f: | |
data = load(f, encoding='latin1') | |
if self.flatten: | |
self.data["train"] = data[0] | |
self.data["valid"] = data[1] | |
self.data["test"] = data[2] | |
else: | |
self.data["train"] = [data[0][0].reshape((-1, 28, 28)), data[0][1]] | |
self.data["valid"] = [data[1][0].reshape((-1, 28, 28)), data[1][1]] | |
self.data["test"] = [data[2][0].reshape((-1, 28, 28)), data[2][1]] | |
if verbose: | |
print("Train - Images:", self.data["train"][0].shape, "Targets:", self.data["train"][1].shape) | |
print("Valid - Images:", self.data["valid"][0].shape, "Targets:", self.data["valid"][1].shape) | |
print("Test - Images:", self.data["test"][0].shape, "Targets:", self.data["test"][1].shape) | |
def download(self): | |
""" | |
Downloads the data set from the website. | |
:return: | |
""" | |
if not os.path.exists(self.path): | |
os.makedirs(self.path) | |
file_path = os.path.join(self.path, "mnist.pkl.gz") | |
general_utils.download_file(Mnist.download_url, file_path) | |
def verify_download(self): | |
""" | |
Verifies that the dataset has been downloaded and is present. | |
:return: | |
""" | |
return os.path.exists(os.path.join(self.path, "mnist.pkl.gz")) | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser() | |
parser.add_argument("optimizer", type=str) | |
parser.add_argument("--seed", type=int, default=None) | |
parser.add_argument("--epochs", type=int, default=100) | |
parser.add_argument("--batch_size", type=int, default=250) | |
run(**vars(parser.parse_args())) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import theano | |
import theano.tensor as T | |
from theano.printing import Print | |
fx = theano.config.floatX | |
def print_values(var, msg): | |
""" | |
Makes an op to print the values of the variable with the message | |
""" | |
return Print(msg)(var) | |
def solve(c, d, h_min, debug=False): | |
# We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2 | |
# where x = sqrt(mu) | |
# Minimising this reduces to solving | |
# y^3 + p * y + p = 0 | |
# y = x - 1 | |
# p = (D^2 h_min^2)/(2 * C) | |
p = (T.sqr(d) * T.sqr(h_min)) / (T.constant(2) * c) | |
w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5) | |
w = T.power(w3, 1.0 / 3.0) | |
y = w - p / (3 * w) | |
sqrt_mu = y + 1 | |
if debug: | |
value = print_values(y*y*y + p*y + p, "derivative_value") | |
sqrt_mu += 1e-20 * value | |
return sqrt_mu | |
def solve_np(c, d, h_min): | |
p = (np.square(d) * np.square(h_min)) / np.asarray(2 * c, dtype=fx) | |
coeffs = np.asarray([-1.0, 3.0, - 3 - p, 1.0], dtype=fx) | |
return np.roots(coeffs) | |
if __name__ == "__main__": | |
N = 20 | |
D = 5 | |
c = (np.random.rand(N) * D).astype(fx) | |
d = (np.random.rand(N) * D).astype(fx) | |
h_min = (np.random.rand(N) * D).astype(fx) | |
for i in range(N): | |
x1 = solve(c[i], d[i], h_min[i]).eval().item() | |
x2 = solve_np(c[i], d[i], h_min[i]) | |
ind = np.real(x2) == np.sqrt(x2 * np.conj(x2)) | |
x2 = np.real(x2[ind])[0] | |
print("{:.10f} - {:.10f} = {:.10f}".format(x1, x2, (x1 - x2))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import theano | |
import theano.tensor as T | |
from theano.printing import Print | |
from collections import OrderedDict | |
def yellow_fin(loss, params, beta=0.999, | |
learning_rate_init=1.0, momentum_init=0.0, | |
learning_rate_factor=1.0, t=None, | |
window_width=20, debug=False): | |
""" | |
The Yellow Fin algorithm. | |
:param loss: Theano expression of the loss | |
:param params: List of shared variables | |
:param beta: The moving average smoothing variable | |
:param learning_rate_init: initial learning rate | |
:param momentum_init: initial momentum | |
:padam learning_rate_factor: An additionaly multiplication factor applied to the base learning_rate. | |
:param t: optionally pass your own time variable | |
:param window_width: width of the window for calculating h_max and h_min | |
:param debug: flag for debugging | |
:return: | |
""" | |
grads = T.grad(loss, params) | |
updates = OrderedDict() | |
alpha = theano.shared(value_floatX(learning_rate_init), name="learning_rate") | |
mu = theano.shared(value_floatX(momentum_init), name="momentum") | |
if t is None: | |
t = theano.shared(np.asarray(0).astype(np.int32), name="t") | |
updates[t] = t + 1 | |
# Fetch variables from the routines | |
h_max, h_min = curvature_range(grads, beta, t, updates, window_width) | |
c = gradient_variance(grads, params, beta, updates) | |
d = distance_to_optim(grads, beta, updates) | |
if debug: | |
h_max = print_values(h_max, "h_max") | |
h_min = print_values(h_min, "h_min") | |
c = print_values(c, "c") | |
d = print_values(d, "d") | |
# Get the solution to the minimisation problem for mu | |
sqrt_mu1 = solve(c, d, h_min) | |
if debug: | |
sqrt_mu1 = print_values(sqrt_mu1, "sqrt_mu1") | |
sqrt_mu2 = (T.sqrt(h_max) - T.sqrt(h_min)) / (T.sqrt(h_max) + T.sqrt(h_min)) | |
sqrt_mu = T.maximum(sqrt_mu1, sqrt_mu2) | |
# Given the solution the final mu_t and alpha_t | |
alpha_t = T.sqr(1 - sqrt_mu) / h_min | |
mu_t = T.sqr(sqrt_mu) | |
if debug: | |
mu_t = print_values(mu_t, "mu_t") | |
alpha_t = print_values(alpha_t, "alpha_t") | |
# Update moving averages | |
updates[mu] = ema(beta, mu, mu_t) | |
updates[alpha] = ema(beta, alpha, alpha_t) | |
if debug: | |
updates[mu] = print_values(updates[mu], "mu") | |
updates[alpha] = print_values(updates[alpha], "alpha") | |
# Apply momentum | |
momentum(grads, params, learning_rate_factor * updates[alpha], updates[mu], updates) | |
return updates | |
def curvature_range(grads, beta, t, updates, window_width=20, debug=False): | |
""" | |
Routine for calculating the h_max and h_min curvature range. | |
""" | |
# Update the window | |
window = theano.shared(T.zeros((window_width, )).eval(), name="window") | |
t_mod = T.mod_check(t, window_width) | |
updates[window] = T.set_subtensor(window[t_mod], sum(T.sum(T.sqr(g)) for g in grads)) | |
if debug: | |
updates[window] = print_values(updates[window], "window") | |
# Get the h_max_t and h_min_t | |
t = T.minimum(t + 1, window_width) | |
h_max_t = T.max(updates[window][:t]) | |
h_min_t = T.min(updates[window][:t]) | |
# Update the moving averages | |
h_max = theano.shared(value_floatX(0.0), name="h_max") | |
h_min = theano.shared(value_floatX(0.0), name="h_min") | |
updates[h_max] = ema(beta, h_max, h_max_t) | |
updates[h_min] = ema(beta, h_min, h_min_t) | |
return updates[h_max], updates[h_min] | |
def gradient_variance(grads, params, beta, updates): | |
""" | |
Routine for calculating the variance of the gradients. | |
""" | |
# Total variance | |
variance = 0 | |
for param, grad in zip(params, grads): | |
# Make shared variables | |
mom1 = shared_mirror(param) | |
mom2 = shared_mirror(param) | |
# Update moving averages | |
updates[mom1] = ema(beta, mom1, grad) | |
updates[mom2] = ema(beta, mom2, T.sqr(grad)) | |
# Update the total variance | |
variance += T.sum(T.abs_(updates[mom2] - T.sqr(updates[mom1]))) | |
return variance | |
def distance_to_optim(grads, beta, updates): | |
""" | |
Routine fro calculating the distance to the optimum. | |
""" | |
# Had issue with initializing to 0.0, so switched to 1.0 | |
g = theano.shared(value_floatX(1.0), name="g") | |
h = theano.shared(value_floatX(1.0), name="h") | |
d = theano.shared(value_floatX(1.0), name="d") | |
# L2 norm | |
l2_norm = sum(T.sum(T.sqr(g)) for g in grads) | |
updates[g] = ema(beta, g, T.sqrt(l2_norm)) | |
updates[h] = ema(beta, h, l2_norm) | |
updates[d] = ema(beta, d, updates[g] / updates[h]) | |
return updates[d] | |
def solve(c, d, h_min, debug=False): | |
# We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2 | |
# where x = sqrt(mu) | |
# Minimising this reduces to solving | |
# y^3 + p * y + p = 0 | |
# y = x - 1 | |
# p = (D^2 h_min^2)/(2 * C) | |
p = (T.sqr(d) * T.sqr(h_min)) / (2 * c) | |
w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5) | |
w = T.power(w3, 1.0 / 3.0) | |
y = w - p / (3 * w) | |
sqrt_mu = y + 1 | |
if debug: | |
value = print_values(y*y*y + p*y + p, "derivative_value") | |
sqrt_mu += 1e-20 * value | |
return sqrt_mu | |
def momentum(grads, params, learning_rate, momentum, updates=None): | |
""" | |
Standard momentum - copied from Lasagne library. | |
""" | |
updates = OrderedDict() if updates is None else updates | |
velocities = [shared_mirror(p) for p in params] | |
for param, grad, v in zip(params, grads, velocities): | |
updates[v] = v * momentum - learning_rate * grad | |
updates[param] = param + updates[v] | |
return updates | |
def print_values(var, msg): | |
""" | |
Makes an op to print the values of the variable with the message | |
""" | |
return Print(msg)(var) | |
def ema(alpha, s_t, x_t): | |
""" | |
Exponential moving average | |
""" | |
return alpha * s_t + (1 - alpha) * x_t | |
def value_floatX(x): | |
""" | |
Converts the value to a numpy array of type theano.config.floatX | |
""" | |
return np.asarray(x).astype(theano.config.floatX) | |
def shared_mirror(shared): | |
""" | |
Creates a shared variable with same specs as the input. | |
""" | |
value = shared.get_value(borrow=True) | |
return theano.shared(np.zeros(value.shape, dtype=value.dtype), | |
broadcastable=shared.broadcastable) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@JianGoForIt
Thanks a lot for stopping by and for the useful comments.
I've also updated this to reflect that and made the learning_rate_init to be
1.0
by default.I've added to the gist a short python script which verifies that the Theano cubic solver is indeed accurate (between 1e-5 -1e-7 difference from np.roots). The solver relies on Vieta's substitution in the cubic equation and the fact that we know that there exists only a single real root for the equation since p > 0.
In this regards I don't get any NaNs or something exploding, it is just that the learning curves are worse than those from Adam. For example, for more elaborate results I suggest to check out the discussion here where I ran the 56-Layer ResNet from a standard Lasagne Recipe and the results I got are that YellowFin seem to underperform both Momentum and Adam.