@ajbrock
Created March 16, 2017 16:22
import math

import torch
import torch.optim
from torch.optim.optimizer import Optimizer, required


class AdamHD(Optimizer):
    """Implements Adam with hypergradient descent of the learning rate (Adam-HD).

    The base algorithm was proposed in `Adam: A Method for Stochastic Optimization`_;
    on top of it, the learning rate is adapted online using the hypergradient
    ``beta * <grad_t, u_{t-1}>``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        beta (float, optional): hypergradient learning rate used to adapt ``lr``
            (default: 1e-8)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, beta=1e-8):
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay)
        super(AdamHD, self).__init__(params, defaults)
        self.beta = beta

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = grad.new().resize_as_(grad).zero_()
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = grad.new().resize_as_(grad).zero_()
                    # Previous update direction, used to form the hypergradient
                    state['u'] = grad.new().resize_as_(grad).zero_()

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Hypergradient update of the learning rate
                group['lr'] = group['lr'] - self.beta * torch.dot(grad, state['u'])

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = math.sqrt(bias_correction2) / bias_correction1

                # Update direction for this step; stored so that the next
                # step's hypergradient can use it
                state['u'] = -step_size * exp_avg / denom
                p.data.add_(group['lr'] * state['u'])

        return loss
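
Read as equations, one call to step() performs the following for each parameter tensor (my transcription of the code above, writing \alpha for group['lr'], \beta for the hypergradient rate self.beta, g_t for the current gradient, and m_t, v_t for the bias-uncorrected Adam moment estimates):

    \alpha_t = \alpha_{t-1} - \beta \,\langle g_t,\, u_{t-1}\rangle, \qquad
    u_t = -\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}} \cdot \frac{m_t}{\sqrt{v_t}+\epsilon}, \qquad
    \theta_t = \theta_{t-1} + \alpha_t\, u_t

In other words, the learning rate itself is adjusted by gradient descent, using the previous step's update direction u_{t-1} as the estimate of the derivative of the parameters with respect to \alpha.
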
damaru2 commented Dec 11, 2017

This implementation is wrong. The dot product of the gradient and u is being taken component-wise, separately for each parameter tensor. However, in the hypergradient descent method (as described in "Online Learning Rate Adaptation with Hypergradient Descent"), the dot product has to be taken between the whole gradient (or its minibatch estimator) and the whole vector u.
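
For concreteness, a minimal sketch of the fix damaru2 describes (my own illustration, not code from the gist; the helper name global_hypergradient is made up). It accumulates <grad, u> over every parameter tensor so that a single learning-rate update, shared by all groups, uses the full dot product:

import torch

def global_hypergradient(optimizer):
    # Hypothetical helper: sum <grad, u> over ALL parameters, so the
    # learning-rate update sees the full dot product rather than a
    # per-tensor one.
    h = 0.0
    for group in optimizer.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            u = optimizer.state[p].get('u')
            if u is None:  # first step: u has not been stored yet
                continue
            h = h + torch.dot(p.grad.data.view(-1), u.view(-1))
    return h

Inside step(), this would be called once per iteration before the parameter loop; every group's lr would then be decreased by self.beta * h, and the per-tensor Adam update would proceed as before.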
