@ynd
Last active November 17, 2022 09:41
import numpy
import theano
from theano import tensor as T
from theano.sandbox import rng_mrg
class EquiSGD(object):
    """Equilibrated SGD (eSGD).

    Parameters
    ----------
    parameters : list
        List of parameters of the model. Must be Theano shared variables.
    gradients : list
        List of the gradients w.r.t. each parameter.
    """
    def __init__(self, parameters, gradients):
        self.parameters = parameters
        self.gradients = gradients
        # Momentum memories, one per parameter.
        self.memories = [theano.shared(numpy.zeros_like(p.get_value()))
                         for p in self.parameters]
        # Running estimate of the diagonal equilibration preconditioner.
        self.diag = [theano.shared(numpy.zeros_like(p.get_value()))
                     for p in self.parameters]
        self.rng = rng_mrg.MRG_RandomStreams(numpy.random.randint(2**30))
    def updates(self, learning_rate, momentum, momentum_tm1=None,
                epsilon=0.0001, decay=0.9):
        """Return the updates for the parameters.

        It returns three lists of updates. The first only refreshes the
        estimate of the equilibration matrix. The second is slower
        because it refreshes that estimate and applies a step; it
        should be used about once every 20 updates. The third is faster
        because it applies a step using the saved estimate of the
        equilibration matrix.

        Parameters
        ----------
        learning_rate : float
            Learning rate to use for gradient descent.
        momentum : float
            Momentum rate to use for gradient descent.
        momentum_tm1 : float, optional
            Momentum rate used at the previous step. Defaults to
            ``momentum``.
        epsilon : float, optional
            Damping constant added to the diagonal estimate before
            taking the square root.
        decay : float, optional
            Decay rate of the running average used for the diagonal
            estimate.
        """
        if momentum_tm1 is None:
            momentum_tm1 = momentum
        # Equilibration estimator: for Gaussian samples v, the expected
        # elementwise square of the Hessian-vector product H v equals
        # the squared row norms of the Hessian.
        samples = [self.rng.normal(size=p.shape, avg=0, std=1,
                                   dtype=theano.config.floatX)
                   for p in self.parameters]
        # Hessian-vector products via the L-operator on the gradients.
        product = theano.gradient.Lop(self.gradients, self.parameters, samples)
        # Exponential moving average of the squared products.
        diag_new = [decay * d + (1 - decay) * p**2 for d, p in
                    zip(self.diag, product)]
        # Wrapped in list() so they can be appended to under Python 3.
        diag_updates = list(zip(self.diag, diag_new))
        slow_updates = list(zip(self.diag, diag_new))
        # Slow path: apply the step using the refreshed estimate.
        for param, grad, memory, diag in \
                zip(self.parameters, self.gradients, self.memories, diag_new):
            update = momentum_tm1 * memory - learning_rate * grad
            update2 = ((momentum * momentum_tm1 * memory
                        - (1 + momentum) * learning_rate * grad)
                       / T.sqrt(diag + epsilon))
            slow_updates.append((memory, update))
            slow_updates.append((param, param + update2))
        # Fast path: apply the step using the stored estimate.
        fast_updates = []
        for param, grad, memory, diag in \
                zip(self.parameters, self.gradients, self.memories, self.diag):
            update = momentum_tm1 * memory - learning_rate * grad
            update2 = ((momentum * momentum_tm1 * memory
                        - (1 + momentum) * learning_rate * grad)
                       / T.sqrt(diag + epsilon))
            fast_updates.append((memory, update))
            fast_updates.append((param, param + update2))
        return diag_updates, slow_updates, fast_updates
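
The gist does not include a usage example, so here is a minimal sketch of how the three update lists might be wired into a Theano training loop. The softmax model, the names x, y, W, b, cost, and the batches iterator are hypothetical placeholders, not part of the gist; the 20-step refresh interval follows the docstring above.

import numpy
import theano
from theano import tensor as T

x = T.matrix('x')
y = T.ivector('y')

# Hypothetical model: a single softmax layer on 784-dimensional inputs.
W = theano.shared(numpy.zeros((784, 10), dtype=theano.config.floatX))
b = theano.shared(numpy.zeros(10, dtype=theano.config.floatX))
p_y = T.nnet.softmax(T.dot(x, W) + b)
cost = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])

params = [W, b]
grads = T.grad(cost, params)

optimizer = EquiSGD(params, grads)
# diag_updates only refreshes the curvature estimate and is not needed
# when alternating between the slow and fast updates.
diag_updates, slow_updates, fast_updates = optimizer.updates(
    learning_rate=0.01, momentum=0.9)

train_slow = theano.function([x, y], cost, updates=slow_updates)
train_fast = theano.function([x, y], cost, updates=fast_updates)

for step, (batch_x, batch_y) in enumerate(batches):  # `batches`: your minibatch iterator
    if step % 20 == 0:
        loss = train_slow(batch_x, batch_y)  # refresh the equilibration estimate
    else:
        loss = train_fast(batch_x, batch_y)  # reuse the stored estimate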
@rodrigob

Is there any example usage of this code?
