#!/usr/bin/env python
# coding: utf-8

# In[1]:

# momentum "m_hat" and gradient "g_hat"

# In[2]:

from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.framework import ops
from tensorflow.python.training import optimizer
import tensorflow as tf
import numpy as np

# In[3]:

# Gradient Descent
#optimizer = tf.optimizers.SGD(learning_rate = 0.1)#.minimize(cost)
#loss = lambda: 3 * var1 * var1 + 2 * var2 * var2

# Weighted overall epoch loss: a lambda-weighted L1 term plus (1 - lambda) times
# the sum of the L2 and L3 terms.
epoch_overall_loss = lambda small_lambda, var_epoch_loss_L1, var_epoch_loss_L2, var_epoch_loss_L3: (
    small_lambda * var_epoch_loss_L1 +
    (1 - small_lambda) * (var_epoch_loss_L2 + var_epoch_loss_L3))
# ^ use later? ^
#opt.minimize(epoch_overall_loss, var_list=[var_epoch_loss_L1, var_epoch_loss_L2, var_epoch_loss_L3])

# The method minimize() is called with a "cost" (loss) as its parameter.
# ^ It consists of the two methods compute_gradients() and then apply_gradients(). ^
# ^ minimize() relies on the (new) Optimizer class, which we will create, to
# implement the following methods: _create_slots(), _prepare(),
# _apply_dense(), and _apply_sparse(). ^
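
# Hedged usage sketch (added for illustration; not part of the original training
# code): driving the combined epoch loss above with a stock TF2 optimizer in
# eager mode. The three loss variables and the 0.5 weight are placeholder
# assumptions standing in for whatever the real training loop produces.
_demo_l1 = tf.Variable(1.0)
_demo_l2 = tf.Variable(2.0)
_demo_l3 = tf.Variable(3.0)
_demo_opt = tf.optimizers.SGD(learning_rate=0.1)
with tf.GradientTape() as _demo_tape:
    _demo_total = epoch_overall_loss(0.5, _demo_l1, _demo_l2, _demo_l3)
_demo_grads = _demo_tape.gradient(_demo_total, [_demo_l1, _demo_l2, _demo_l3])
_demo_opt.apply_gradients(zip(_demo_grads, [_demo_l1, _demo_l2, _demo_l3]))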

# In[4]:

# _create_slots() and _prepare() create and initialise additional
# variables, such as momentum.
'''
def _create_slots(self, var_list):
    # Create slots for allocation and later management of additional
    # variables associated with the variables to train,
    # for example: the first and second moments.
    for v in var_list:
        self._zeros_slot(v, "m", self._name)
        self._zeros_slot(v, "v", self._name)
'''

# In[5]:

'''
def _apply_dense(self, grad, var):
    # Define your favourite variable update here,
    # for example:
    # apply gradient descent by subtracting from the variables
    # the gradient times the learning_rate (defined in __init__).
    var_update = state_ops.assign_sub(var, self.learning_rate * grad)
    # The trick is now to pass the ops into control_flow_ops.group(), which
    # eventually groups any particular computation of the slots you
    # wish to keep track of, for example:
    m_t = ...m... # do something with m and grad
    v_t = ...v... # do something with v and grad
    return control_flow_ops.group(*[var_update, m_t, v_t])
'''

# In[6]:

# v!! THIS IS PROBABLY THE MAIN PART OF WHAT I SHOULD BE PAYING ATTN TO !!v
# v For PowerSign, the update of the variables works as follows (all products
#   are element-wise): v
#   w_(n+1) = w_(n) - alpha(learning_rate) * g_hat *
#             alpha(theirs) ^ ( f_n(decay_rate) * sign(g_hat) * sign(m_hat) )
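
# Quick numeric check (added for illustration): the PowerSign scaling factor
# alpha ** (sign(g_hat) * sign(m_hat)) equals alpha when the gradient and the
# moving average agree in sign and 1/alpha when they disagree (the decay f_n
# is omitted here). The numbers are arbitrary stand-ins.
_alpha_demo = 2.0
print(np.exp(np.log(_alpha_demo) * np.sign(0.3) * np.sign(0.7)))    # -> 2.0 (signs agree)
print(np.exp(np.log(_alpha_demo) * np.sign(0.3) * np.sign(-0.7)))   # -> 0.5 (signs disagree)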

class Step11_Optimizer(optimizer.Optimizer):
    """Custom Step11 optimizer, adapted from an implementation of PowerSign.
    See [Bello et al., 2017](https://arxiv.org/abs/1709.07417)
    @@__init__
    """
    def __init__(self, learning_rate_alpha=0.1, d=1, batch_size=1250, use_locking=False, name="Step11"):
        super(Step11_Optimizer, self).__init__(use_locking, name)
        self._lra = learning_rate_alpha
        self._d = d
        self._batch_size = batch_size
        # Tensor versions of the constructor arguments, created in _prepare().
        self._lra_tensor = None
        self._d_tensor = None
        self._batch_size_tensor = None
    def _prepare(self):
        self._lra_tensor = ops.convert_to_tensor(self._lra, name="learning_rate_alpha_tensor")
        self._d_tensor = ops.convert_to_tensor(self._d, name="d_tensor") # used to be beta, should have been alpha?
        self._batch_size_tensor = ops.convert_to_tensor(self._batch_size, name="batch_size_tensor")

    # Not using momentum, so we won't have to create slots to keep track of
    # variables mid-optimization (?).
    # d is basically a hyperparameter that we have to update during training.
    #def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # I think this for loop is here because we are trying to put momentum on all vars (n dims).
        #for v in var_list:
            #self._zeros_slot(v, "m", self._name)
            #add_slot

    def set_changing_hypers(self, d):
        # Rebind d (and its tensor) between epochs; in graph mode this must happen
        # before the update op is built for the new value to take effect.
        self._d = d
        self._d_tensor = ops.convert_to_tensor(self._d, name="d_tensor")
    def _apply_dense(self, grad, var):
        lra_tensor = math_ops.cast(self._lra_tensor, var.dtype.base_dtype)
        d_tensor = math_ops.cast(self._d_tensor, var.dtype.base_dtype)
        batch_size_tensor = math_ops.cast(self._batch_size_tensor, var.dtype.base_dtype)
        #eps = 1e-7 # cap for moving average
        #m = self.get_slot(var, "m")
        #m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))
        #var_update = state_ops.assign_sub(var, lr_t*grad*tf.exp( tf.log(alpha_t)*tf.sign(grad)*tf.sign(m_t))) # Update 'ref' by subtracting 'value'
        var_update = state_ops.assign_sub(var, lra_tensor * (d_tensor / batch_size_tensor))
        # Create an op that groups multiple operations.
        # When this op finishes, all ops in its input have finished.
        return control_flow_ops.group(*[var_update])

    def _apply_sparse(self, grad, var):
        raise NotImplementedError("Sparse gradient updates are not supported.")
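
# Hedged usage sketch (added for illustration; not part of the original gist):
# exercising Step11_Optimizer on a toy quadratic. The legacy optimizer.Optimizer
# base class is a graph-mode API, so the sketch builds a TF1-style graph and runs
# it in a tf.compat.v1.Session. Variable, loss, and hyperparameter values are
# placeholder assumptions.
_demo_graph = tf.Graph()
with _demo_graph.as_default():
    # use_resource=False keeps a ref variable so the _apply_dense() path above is
    # taken (a resource variable would route to the unimplemented
    # _resource_apply_dense()).
    _demo_w = tf.compat.v1.get_variable("demo_w", initializer=5.0, use_resource=False)
    _demo_loss = tf.square(_demo_w)
    _demo_step11 = Step11_Optimizer(learning_rate_alpha=0.1, d=1, batch_size=1250)
    # d is refreshed during training; in graph mode it must be set before the
    # update op is built, since _prepare() bakes its value into the graph.
    _demo_step11.set_changing_hypers(d=2)
    _demo_train_op = _demo_step11.minimize(_demo_loss, var_list=[_demo_w])
    _demo_init_op = tf.compat.v1.global_variables_initializer()
with tf.compat.v1.Session(graph=_demo_graph) as _demo_sess:
    _demo_sess.run(_demo_init_op)
    for _ in range(3):
        _demo_sess.run(_demo_train_op)
    print("demo_w after 3 steps:", _demo_sess.run(_demo_w))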