Pylearn2 RMSProp
# Excerpt from Pylearn2's RMSProp learning rule. This is a method of the
# RMSProp class, so `self.decay`, `self.epsilon`, and
# `self.mean_square_grads` are attributes set by the enclosing class.
# The imports below are the ones this excerpt relies on.
from collections import OrderedDict
import warnings

import theano.tensor as T
from pylearn2.utils import sharedX


def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Provides the symbolic (theano) description of the updates needed to
    perform this learning rule. See Notes for side effects.

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    lr_scalers : dict
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.

    Returns
    -------
    updates : OrderedDict
        A dictionary mapping from the old model parameters to their new
        values after a single iteration of the learning rule.

    Notes
    -----
    This method has the side effect of storing the moving average
    of the square gradient in `self.mean_square_grads`. This is
    necessary in order for the monitoring channels to be able
    to track the value of these moving averages.
    Therefore, this method should only get called once for each
    instance of RMSProp.
    """
    updates = OrderedDict()
    for param in grads:
        # mean_square_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(param.get_value() * 0.)

        if param.name is None:
            raise ValueError("Model parameters must be named.")
        mean_square_grad.name = 'mean_square_grad_' + param.name

        if param.name in self.mean_square_grads:
            warnings.warn("Calling get_updates more than once on the "
                          "gradients of `%s` may make monitored values "
                          "incorrect." % param.name)

        # Store variable in self.mean_square_grads for monitoring.
        self.mean_square_grads[param.name] = mean_square_grad

        # Accumulate the moving average of the squared gradient:
        # E[g^2]_t = decay * E[g^2]_{t-1} + (1 - decay) * g_t^2
        new_mean_squared_grad = (self.decay * mean_square_grad +
                                 (1 - self.decay) * T.sqr(grads[param]))

        # Compute the update: scale the gradient by the root mean square,
        # clipped below by epsilon for numerical stability
        scaled_lr = lr_scalers.get(param, 1.) * learning_rate
        rms_grad_t = T.sqrt(new_mean_squared_grad)
        rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
        delta_x_t = - scaled_lr * grads[param] / rms_grad_t

        # Apply update
        updates[mean_square_grad] = new_mean_squared_grad
        updates[param] = param + delta_x_t

    return updates
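To make the update rule above concrete outside Theano, here is a minimal NumPy sketch of the same RMSProp step. It is not Pylearn2 code: the function name `rmsprop_step` and the default values for `lr`, `decay`, and `epsilon` are illustrative assumptions standing in for `learning_rate`, `self.decay`, and `self.epsilon` above.

# Minimal NumPy sketch of one RMSProp step (illustrative, not Pylearn2 code).
import numpy as np

def rmsprop_step(param, grad, mean_square_grad, lr=0.001, decay=0.9,
                 epsilon=1e-7):
    """Return the updated parameter and the new running average E[g^2]."""
    # Accumulate the exponential moving average of the squared gradient:
    # E[g^2]_t = decay * E[g^2]_{t-1} + (1 - decay) * g_t^2
    new_msg = decay * mean_square_grad + (1.0 - decay) * grad ** 2
    # Scale the gradient by the root mean square, clipped below by epsilon
    rms = np.maximum(np.sqrt(new_msg), epsilon)
    new_param = param - lr * grad / rms
    return new_param, new_msg

# Toy usage: drive a single scalar parameter of f(p) = p^2 toward zero.
p, msg = np.array(5.0), np.zeros(())
for _ in range(100):
    g = 2.0 * p  # gradient of f(p) = p^2
    p, msg = rmsprop_step(p, g, msg, lr=0.1)

Clipping the root mean square at `epsilon` plays the same role as `T.maximum(rms_grad_t, self.epsilon)` in the Theano code: it keeps the effective step size bounded when the accumulated squared gradient is close to zero.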