Theano optimizers
# Authors: Kyle Kastner
# License: BSD 3-clause
import theano.tensor as T
import numpy as np
import theano


class rmsprop(object):
    """
    RMSProp with Nesterov momentum and gradient rescaling.
    """
    def __init__(self, params):
        # Per-parameter state: running mean of squared gradients, running
        # mean of gradients, and the Nesterov momentum ("memory") buffer.
        self.running_square_ = [theano.shared(np.zeros_like(p.get_value()))
                                for p in params]
        self.running_avg_ = [theano.shared(np.zeros_like(p.get_value()))
                             for p in params]
        self.memory_ = [theano.shared(np.zeros_like(p.get_value()))
                        for p in params]
    def updates(self, params, grads, learning_rate, momentum, rescale=5.):
        # Global L2 norm of the gradient, used to rescale (clip) the whole
        # gradient whenever its norm exceeds `rescale`.
        grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        # Magic constants
        combination_coeff = 0.9
        minimum_grad = 1E-4
        updates = []
        for n, (param, grad) in enumerate(zip(params, grads)):
            # If the norm is NaN/inf, fall back to shrinking the parameter;
            # otherwise rescale the gradient so its global norm is at most
            # `rescale`.
            grad = T.switch(not_finite, 0.1 * param,
                            grad * (scaling_num / scaling_den))
            # Centered RMS estimate of the gradient (Graves-style RMSProp).
            old_square = self.running_square_[n]
            new_square = combination_coeff * old_square + (
                1. - combination_coeff) * T.sqr(grad)
            old_avg = self.running_avg_[n]
            new_avg = combination_coeff * old_avg + (
                1. - combination_coeff) * grad
            rms_grad = T.sqrt(new_square - new_avg ** 2)
            rms_grad = T.maximum(rms_grad, minimum_grad)
            # Nesterov momentum applied to the RMS-normalized gradient.
            memory = self.memory_[n]
            update = momentum * memory - learning_rate * grad / rms_grad
            update2 = momentum * momentum * memory - (
                1 + momentum) * learning_rate * grad / rms_grad
            updates.append((old_square, new_square))
            updates.append((old_avg, new_avg))
            updates.append((memory, update))
            updates.append((param, param + update2))
        return updates
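
# Note on the Nesterov form used above (explanatory comment, not part of the
# original gist): writing g for the (possibly RMS-normalized) gradient and
# v_new = momentum * v - learning_rate * g for the velocity update, the
# applied parameter step
#     momentum**2 * v - (1 + momentum) * learning_rate * g
# equals momentum * v_new - learning_rate * g, i.e. the usual Nesterov
# "lookahead" step rewritten so that `param` always holds the actual
# parameter values rather than the lookahead ones.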


class sgd_nesterov(object):
    """
    Plain SGD with Nesterov momentum, using the same reformulation as above.
    """
    def __init__(self, params):
        self.memory_ = [theano.shared(np.zeros_like(p.get_value()))
                        for p in params]

    def updates(self, params, grads, learning_rate, momentum):
        updates = []
        for n, (param, grad) in enumerate(zip(params, grads)):
            memory = self.memory_[n]
            update = momentum * memory - learning_rate * grad
            update2 = momentum * momentum * memory - (
                1 + momentum) * learning_rate * grad
            updates.append((memory, update))
            updates.append((param, param + update2))
        return updates


class sgd(object):
    # Only here for API conformity with other optimizers
    def __init__(self, params):
        pass

    def updates(self, params, grads, learning_rate):
        updates = []
        for param, grad in zip(params, grads):
            updates.append((param, param - learning_rate * grad))
        return updates
"""
Usage:
grads = T.grad(cost, self.params)
#opt = sgd_nesterov(self.params)
opt = rmsprop(self.params)
updates = opt.updates(self.params, grads,
learning_rate / np.cast['float32'](self.batch_size),
momentum)
"""