@ynd
Last active November 17, 2022 09:41
import numpy
import theano
from theano import tensor as T
from theano.sandbox import rng_mrg
class EquiSGD(object):
    """Equilibrated SGD (eSGD).

    Parameters
    ----------
    parameters : list
        List of parameters of the model. Must be Theano shared variables.
    gradients : list
        List of the gradients w.r.t. each parameter.
    """
    def __init__(self, parameters, gradients):
        self.parameters = parameters
        self.gradients = gradients
        # Momentum memories, one per parameter.
        self.memories = [theano.shared(numpy.zeros_like(p.get_value()))
                         for p in self.parameters]
        # Running estimate of the diagonal equilibration preconditioner.
        self.diag = [theano.shared(numpy.zeros_like(p.get_value()))
                     for p in self.parameters]
        self.rng = rng_mrg.MRG_RandomStreams(numpy.random.randint(2**30))
    def updates(self, learning_rate, momentum, momentum_tm1=None,
                epsilon=0.0001, decay=0.9):
        """Return the updates for the parameters.

        It returns three lists of updates. The first only refreshes the
        estimate of the equilibration matrix. The second is slower
        because it refreshes that estimate and applies a step; it
        should be used about once every 20 updates. The third is faster
        because it applies a step using the saved estimate of the
        equilibration matrix.

        Parameters
        ----------
        learning_rate : float
            Learning rate to use for gradient descent.
        momentum : float
            Momentum rate to use for gradient descent.
        momentum_tm1 : float, optional
            Momentum rate used at the previous step. Defaults to
            ``momentum``.
        epsilon : float, optional
            Damping constant added to the diagonal estimate before
            taking the square root.
        decay : float, optional
            Decay rate of the running average used for the diagonal
            estimate.
        """
        if momentum_tm1 is None:
            momentum_tm1 = momentum
        # Equilibration estimator: for Gaussian samples v, the expected
        # elementwise square of the Hessian-vector product H v equals
        # the squared row norms of the Hessian.
        samples = [self.rng.normal(size=p.shape, avg=0, std=1,
                                   dtype=theano.config.floatX)
                   for p in self.parameters]
        # Hessian-vector products via the L-operator on the gradients.
        product = theano.gradient.Lop(self.gradients, self.parameters, samples)
        # Exponential moving average of the squared products.
        diag_new = [decay * d + (1 - decay) * p**2 for d, p in
                    zip(self.diag, product)]
        # Wrapped in list() so they can be appended to under Python 3.
        diag_updates = list(zip(self.diag, diag_new))
        slow_updates = list(zip(self.diag, diag_new))
        # Slow path: apply the step using the refreshed estimate.
        for param, grad, memory, diag in \
                zip(self.parameters, self.gradients, self.memories, diag_new):
            update = momentum_tm1 * memory - learning_rate * grad
            update2 = ((momentum * momentum_tm1 * memory
                        - (1 + momentum) * learning_rate * grad)
                       / T.sqrt(diag + epsilon))
            slow_updates.append((memory, update))
            slow_updates.append((param, param + update2))
        # Fast path: apply the step using the stored estimate.
        fast_updates = []
        for param, grad, memory, diag in \
                zip(self.parameters, self.gradients, self.memories, self.diag):
            update = momentum_tm1 * memory - learning_rate * grad
            update2 = ((momentum * momentum_tm1 * memory
                        - (1 + momentum) * learning_rate * grad)
                       / T.sqrt(diag + epsilon))
            fast_updates.append((memory, update))
            fast_updates.append((param, param + update2))
        return diag_updates, slow_updates, fast_updates
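
The gist does not include a usage example, so here is a minimal sketch of how the three update lists might be wired into a Theano training loop. The softmax model, the names x, y, W, b, cost, and the batches iterator are hypothetical placeholders, not part of the gist; the 20-step refresh interval follows the docstring above.

import numpy
import theano
from theano import tensor as T

x = T.matrix('x')
y = T.ivector('y')

# Hypothetical model: a single softmax layer on 784-dimensional inputs.
W = theano.shared(numpy.zeros((784, 10), dtype=theano.config.floatX))
b = theano.shared(numpy.zeros(10, dtype=theano.config.floatX))
p_y = T.nnet.softmax(T.dot(x, W) + b)
cost = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])

params = [W, b]
grads = T.grad(cost, params)

optimizer = EquiSGD(params, grads)
# diag_updates only refreshes the curvature estimate and is not needed
# when alternating between the slow and fast updates.
diag_updates, slow_updates, fast_updates = optimizer.updates(
    learning_rate=0.01, momentum=0.9)

train_slow = theano.function([x, y], cost, updates=slow_updates)
train_fast = theano.function([x, y], cost, updates=fast_updates)

for step, (batch_x, batch_y) in enumerate(batches):  # `batches`: your minibatch iterator
    if step % 20 == 0:
        loss = train_slow(batch_x, batch_y)  # refresh the equilibration estimate
    else:
        loss = train_fast(batch_x, batch_y)  # reuse the stored estimate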
@rodrigob

Is there any example usage of this code?
