@meikuam
Created April 9, 2019 04:29
Optimization algorithms
import numpy as np
# SGD update
while True:
    # ....
    dx = network.backward()
    x = x - learning_rate * dx
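
# A minimal, self-contained sketch of the SGD rule above on a toy quadratic
# loss f(x) = 0.5 * ||x||^2, so dx = x. The toy loss, the starting point, and
# the fixed step count are illustrative assumptions, not part of the original snippet.
def sgd_demo(learning_rate=0.1, steps=100):
    x = np.array([5.0, -3.0])       # hypothetical initial parameters
    for _ in range(steps):
        dx = x                      # gradient of 0.5 * ||x||^2 at x
        x = x - learning_rate * dx  # same update rule as above
    return x                        # converges toward the minimum at the origin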

# Momentum update
while True:
    # ....
    dx = network.backward()
    v = mu * v - learning_rate * dx  # integrate velocity
    x = x + v                        # integrate position

# Nesterov Momentum update (rewritten form: the gradient is taken at the stored, lookahead parameter vector)
while True:
    # ....
    dx = network.backward()
    v_prev = v
    v = mu * v - learning_rate * dx  # integrate velocity
    x = x - mu * v_prev + (1 + mu) * v
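
# A small check (a sketch under assumed toy settings) that the rewritten update
# above matches the standard "lookahead" form of Nesterov momentum, which
# evaluates the gradient at x + mu * v. It uses a toy quadratic with gradient
# g(p) = p; the learning rate, mu, starting point, and step count are assumptions.
def nesterov_forms_demo(learning_rate=0.1, mu=0.9, steps=20):
    g = lambda p: p                            # gradient of 0.5 * ||p||^2
    x, v = np.array([5.0, -3.0]), np.zeros(2)  # lookahead-form state
    y, vy = x.copy(), np.zeros(2)              # rewritten-form state, tracks y = x + mu * v
    for _ in range(steps):
        # standard form: step with the gradient taken at the lookahead point
        v = mu * v - learning_rate * g(x + mu * v)
        x = x + v
        # rewritten form from the snippet above: gradient taken at y directly
        v_prev = vy
        vy = mu * vy - learning_rate * g(y)
        y = y - mu * v_prev + (1 + mu) * vy
    return np.allclose(y, x + mu * v)          # True: the two forms coincide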

# AdaGrad update
# adds element-wise scaling of the gradient based on the historical sum of squares in each dimension
while True:
    # ....
    dx = network.backward()
    cache += dx**2
    x = x - learning_rate * dx / (np.sqrt(cache) + 1e-7)

# RMSProp update
while True:
    # ....
    dx = network.backward()
    cache = decay_rate * cache + (1 - decay_rate) * dx**2
    x = x - learning_rate * dx / (np.sqrt(cache) + 1e-7)
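
# A sketch contrasting the two caches above under a constant gradient of 1.0:
# AdaGrad's cache grows without bound, so its effective step keeps shrinking,
# while RMSProp's leaky average settles near dx**2, so its step levels off.
# The constant gradient, step count, and decay_rate value are illustrative assumptions.
def cache_demo(steps=1000, decay_rate=0.99):
    dx = 1.0
    adagrad_cache, rmsprop_cache = 0.0, 0.0
    for _ in range(steps):
        adagrad_cache += dx**2
        rmsprop_cache = decay_rate * rmsprop_cache + (1 - decay_rate) * dx**2
    # effective per-unit-gradient step scale 1 / (sqrt(cache) + 1e-7) for each method
    return 1 / (np.sqrt(adagrad_cache) + 1e-7), 1 / (np.sqrt(rmsprop_cache) + 1e-7)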

# Adam update (without bias correction)
while True:
    # ....
    dx = network.backward()
    m = beta1 * m + (1 - beta1) * dx       # update first moment
    v = beta2 * v + (1 - beta2) * (dx**2)  # update second moment
    x = x - learning_rate * m / (np.sqrt(v) + 1e-7)

# Adam update with bias correction
beta1 = 0.9
beta2 = 0.995
m, v = 0, 0  # initialize first and second moment estimates to zero
for t in range(1, num_of_iterations + 1):  # t starts at 1 so the corrections are well defined
    # ....
    dx = network.backward()
    m = beta1 * m + (1 - beta1) * dx       # update first moment
    v = beta2 * v + (1 - beta2) * (dx**2)  # update second moment
    mt = m / (1 - beta1**t)                # bias-corrected first moment
    vt = v / (1 - beta2**t)                # bias-corrected second moment
    x = x - learning_rate * mt / (np.sqrt(vt) + 1e-7)
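
# A sketch of why the bias correction matters on the very first step: with m
# initialized at zero and beta1 = 0.9, the raw first moment after one update is
# only (1 - beta1) * dx, i.e. 10x too small; dividing by (1 - beta1**1) restores
# the gradient's scale. The gradient value dx is an arbitrary assumption.
def bias_correction_demo(dx=2.0, beta1=0.9):
    m = 0.0
    m = beta1 * m + (1 - beta1) * dx  # raw first moment after step t = 1
    mt = m / (1 - beta1**1)           # bias-corrected estimate
    return m, mt                      # (0.2, 2.0): mt recovers the scale of dx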

# LAMB update (layer-wise adaptive moments with trust-ratio scaling)
def l2_norm(x):
    return np.sqrt(np.sum(x**2))

beta1 = 0.9
beta2 = 0.995
for t in range(1, num_of_iterations + 1):
    # ....
    # within each layer (m and v are maintained separately per layer)
    for l in range(0, num_layers):
        dx = layer[l].backward()
        m = beta1 * m + (1 - beta1) * dx       # update first moment
        v = beta2 * v + (1 - beta2) * (dx**2)  # update second moment
        mt = m / (1 - beta1**t)                # bias-corrected first moment
        vt = v / (1 - beta2**t)                # bias-corrected second moment
        update = mt / (np.sqrt(vt) + 1e-7) + weight_decay * x
        r1 = l2_norm(x)       # norm of the layer's weights
        r2 = l2_norm(update)  # norm of the proposed update
        r = r1 / r2           # trust ratio
        learning_rate_l = r * learning_rate  # layer-wise learning rate: scalar scaling, not a cross product
        x = x - learning_rate_l * update
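
# A sketch of the trust ratio on a single hypothetical layer: the ratio
# ||x|| / ||update|| is a scalar, so the layer-wise learning rate is just the base
# rate rescaled per layer (no vector product involved). The weight values, the
# Adam-style step, and weight_decay below are illustrative assumptions.
def trust_ratio_demo(learning_rate=1e-3, weight_decay=0.01):
    x = np.array([[0.5, -1.0], [2.0, 0.1]])          # hypothetical layer weights
    adam_step = np.array([[0.9, -1.1], [1.0, 0.8]])  # hypothetical mt / (sqrt(vt) + 1e-7)
    update = adam_step + weight_decay * x
    r = l2_norm(x) / l2_norm(update)                 # scalar trust ratio
    learning_rate_l = r * learning_rate              # layer-wise learning rate
    return x - learning_rate_l * update              # updated layer weights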