#!/usr/bin/env python
# coding: utf-8
# In[1]:
#momentum "m_hat" and gradient "g_hat"
# In[2]:
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.framework import ops
from tensorflow.python.training import optimizer
import tensorflow as tf
import numpy as np
# In[3]:
# Gradient Descent
#optimizer = tf.optimizers.SGD(learning_rate = 0.1)#.minimize(cost)
#loss = lambda: 3 * var1 * var1 + 2 * var2 * var2
epoch_overall_loss = lambda small_lambda, var_epoch_loss_L1, var_epoch_loss_L2, var_epoch_loss_L3: (
    small_lambda * var_epoch_loss_L1 +
    (1 - small_lambda) * (var_epoch_loss_L2 + var_epoch_loss_L3))
# ^ use later? ^
#opt.minimize(epoch_overall_loss, var_list=[var_epoch_loss_L1, var_epoch_loss_L2, var_epoch_loss_L3])
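# A quick hedged check of the combined-loss lambda above, with made-up values
# (small_lambda = 0.5 and per-term losses 1.2, 0.8, 0.4); the variable name
# _example_overall is illustrative only and not part of the original training code:
_example_overall = epoch_overall_loss(0.5, 1.2, 0.8, 0.4)
# 0.5 * 1.2 + 0.5 * (0.8 + 0.4) = 1.2
print("example epoch_overall_loss:", _example_overall)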
#The method minimize() is called with a "cost" (loss) as its parameter.
# ^ It consists of two steps: compute_gradients() and then apply_gradients(). ^
# ^ This relies on the (new) Optimizer subclass, which we will create below,
# implementing the following methods: _create_slots(), _prepare(),
# _apply_dense(), and _apply_sparse(). (See the usage sketch at the end of the file.) ^
# In[4]:
#_create_slots() and _prepare() create and initialise additional
# variables, such as momentum.
'''
def _create_slots(self, var_list):
    # Create slots for allocation and later management of additional
    # variables associated with the variables to train.
    # For example: the first and second moments.
    for v in var_list:
        self._zeros_slot(v, "m", self._name)
        self._zeros_slot(v, "v", self._name)
'''
# In[5]:
'''
def _apply_dense(self, grad, var):
    # Define your favourite variable update.
    # For example:
    # Here we apply gradient descent by subtracting from the variables
    # the gradient times the learning_rate (defined in __init__).
    var_update = state_ops.assign_sub(var, self.learning_rate * grad)
    # The trick is now to pass the ops to control_flow_ops and
    # group any particular computations of the slots you
    # wish to keep track of:
    # for example:
    m_t = ...m... # do something with m and grad
    v_t = ...v... # do something with v and grad
    return control_flow_ops.group(*[var_update, m_t, v_t])
'''
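# A hedged, concrete version of the slot pattern quoted above, assuming a plain
# SGD-with-momentum update (made-up decay 0.9). The class name and hyperparameters
# are illustrative only and are NOT part of Step11_Optimizer defined below.
class _MomentumSketchOptimizer(optimizer.Optimizer):
    def __init__(self, learning_rate=0.1, momentum=0.9, use_locking=False, name="MomentumSketch"):
        super(_MomentumSketchOptimizer, self).__init__(use_locking, name)
        self._lr = learning_rate
        self._momentum = momentum
    def _create_slots(self, var_list):
        # One "m" (momentum) slot per trainable variable.
        for v in var_list:
            self._zeros_slot(v, "m", self._name)
    def _apply_dense(self, grad, var):
        m = self.get_slot(var, "m")
        # The "do something with m and grad" step: accumulate a decayed running sum of gradients.
        m_t = state_ops.assign(m, self._momentum * m + grad)
        # Step the variable along the accumulated direction.
        var_update = state_ops.assign_sub(var, self._lr * m_t)
        return control_flow_ops.group(*[var_update, m_t])
    def _apply_sparse(self, grad, var):
        raise NotImplementedError("Sparse gradient updates are not supported.")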
# In[6]:
# v!! THIS IS PROBABLY THE MAIN PART OF WHAT I SHOULD BE PAYING ATTN TO !!v
# v For PowerSign, the update of the variables works as follows: v
# w_(n+1) = w_(n) - learning_rate * g_hat * alpha^( f_n(decay_rate) * sign(g_hat) * sign(m_hat) )
# where alpha is PowerSign's base, g_hat is the gradient, and m_hat is its moving average.
class Step11_Optimizer(optimizer.Optimizer):
    """Implementation of PowerSign.
    See [Bello et. al., 2017](https://arxiv.org/abs/1709.07417)
    @@__init__
    """
    def __init__(self, learning_rate_alpha=0.1, d=1, batch_size=1250, use_locking=False, name="Step11"):
        super(Step11_Optimizer, self).__init__(use_locking, name)
        self._lra = learning_rate_alpha
        self._d = d
        self._batch_size = batch_size
        # Tensor versions of the constructor arguments, created in _prepare().
        self._lra_tensor = None
        self._d_tensor = None
        self._batch_size_tensor = None
    def _prepare(self):
        self._lra_tensor = ops.convert_to_tensor(self._lra, name="learning_rate_alpha_tensor")
        self._d_tensor = ops.convert_to_tensor(self._d, name="d_tensor") #used to be beta, should have been alpha?
        self._batch_size_tensor = ops.convert_to_tensor(self._batch_size, name="batch_size_tensor")
    #Not using momentum, so we won't have to create slots to keep track of vars mid-optimization (?)
    #d is basically a hyperparameter that we have to update during training.
    #def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # I think this for loop is here because we are trying to put a bunch of momentums on all vars (n dims).
        #for v in var_list:
            #self._zeros_slot(v, "m", self._name)
            #add_slot
    def set_changing_hypers(self, d):
        self._d = d
        self._d_tensor = ops.convert_to_tensor(self._d, name="d_tensor")
    def _apply_dense(self, grad, var):
        lra_tensor = math_ops.cast(self._lra_tensor, var.dtype.base_dtype)
        d_tensor = math_ops.cast(self._d_tensor, var.dtype.base_dtype)
        batch_size_tensor = math_ops.cast(self._batch_size_tensor, var.dtype.base_dtype)
        #eps = 1e-7 #cap for moving average
        #m = self.get_slot(var, "m")
        #m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))
        #var_update = state_ops.assign_sub(var, lr_t*grad*tf.exp( tf.log(alpha_t)*tf.sign(grad)*tf.sign(m_t))) #Update 'ref' by subtracting 'value'
        var_update = state_ops.assign_sub(var, lra_tensor * (d_tensor / batch_size_tensor))
        #Create an op that groups multiple operations.
        #When this op finishes, all ops in its input have finished.
        return control_flow_ops.group(*[var_update])
    def _apply_sparse(self, grad, var):
        raise NotImplementedError("Sparse gradient updates are not supported.")
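# Hedged usage sketch: a toy quadratic loss optimized with Step11_Optimizer in
# TF1-style graph mode, showing the minimize() = compute_gradients() +
# apply_gradients() path that eventually calls _apply_dense() above. The loss,
# the variable name "w_demo", and the hyperparameter values are made up for illustration.
if __name__ == "__main__":
    tf.compat.v1.disable_eager_execution()
    tf.compat.v1.disable_resource_variables()  # so _apply_dense (not _resource_apply_dense) is used
    w = tf.compat.v1.get_variable("w_demo", initializer=tf.constant(5.0))
    toy_loss = tf.square(w)
    opt = Step11_Optimizer(learning_rate_alpha=0.1, d=1, batch_size=1250)
    grads_and_vars = opt.compute_gradients(toy_loss)  # what minimize() does first
    train_op = opt.apply_gradients(grads_and_vars)    # then this, which calls _apply_dense()
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        for _ in range(3):
            _, w_val = sess.run([train_op, w])
            print("w_demo after step:", w_val)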