@redwrasse
Last active February 11, 2021 05:51
example backprop
"""
backprop algorithm on F[a] = xa^2 + a
For each node function f_i there is
- a method to calculate its value on an input, f_i(x_i)
- a method to calculate its derivative value on an input, K_ij
- a method to calculate its parameter derivative on an input, xi_i (written 'ksi' in the code below)
Suppose a single input x in R, and suppose the functional to be
optimized is F[a] = xa^2 + a. Then
F'[a] = 2xa + 1
2xa + 1 = 0
=> a = -1 / (2x), which is a minimum since F''[a] = 2x > 0 for x > 0.
Hence for x = 4 the algorithm should converge to a = -1/8 = -0.125.
Function composition graph. Assume elementary operations:
Add, Mult.
F[a] = add_a( mult_a( mult_a( x ) ) )
So graph is:
add_a
|
mult_a
|
mult_a
|
(input x)
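Evaluating the graph bottom-up checks the composition:
mult_a(x) = xa,  mult_a(xa) = xa^2,  add_a(xa^2) = xa^2 + a = F[a].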
The terms are
add_a
-----
ksi = 1
K_[this][child] = 1
mult_a
------
ksi = x_i (the value of the node's input)
K_[this][child] = a
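Chaining these terms from the root down gives a worked check of the
backward pass (mult_2 is the node nearest the input):
beta_mult_2 = ksi_mult_2 = x
beta_mult_1 = ksi_mult_1 + K_[mult_1][mult_2] * beta_mult_2 = xa + a*x = 2xa
beta_add    = ksi_add + K_[add][mult_1] * beta_mult_1 = 1 + 2xa = F'[a]
so the backward pass recovers the analytic gradient above.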
Output
~~~~~~
registering nodes ...
registered: child [multiplication node 2] of [multiplication node 1] at index 0.
registered: child [multiplication node 1] of [addition node] at index 0.
finished registering nodes.
registering param dependencies with nodes ...
built indirect param set [] for node [multiplication node 2]
built indirect param set ['a'] for node [multiplication node 1]
built indirect param set ['a'] for node [addition node]
finished registering params with nodes.
updated params: 'a': 0.991
updated params: 'a': -0.1246374763459712
updated params: 'a': -0.124999882237097
updated params: 'a': -0.12499999996174568
updated params: 'a': -0.12499999999998755
updated params: 'a': -0.12499999999999914
updated params: 'a': -0.12499999999999914
updated params: 'a': -0.12499999999999914
updated params: 'a': -0.12499999999999914
updated params: 'a': -0.12499999999999914
Note: this particular implementation is a little unorthodox,
treating the parameters not as leaves but as formal parameters attached
to corresponding nodes.
ref. https://www.redwrasse.io/notes/backpropagation
"""
import sys
def functional(x):
def func(a):
return x * a ** 2 + a
return func
def direct_gradient_descent():
"""
for comparison:
optimization by direct gradient descent on F,
computing parameter derivatives numerically
"""
x = 4
F = functional(x)
# initial parameter
a = 1.
# learning rate
eta = 1e-1
# number of gradient descent iterations
n_iters = 10 ** 4
eps = 0.001
# should converge to minimum at -1 / (2x)
# = -1 / 8 = -0.125
for i in range(n_iters):
dF_a = (F(a + eps) - F(a)) / eps
a += - eta * dF_a
if i % 10 ** 3 == 0:
print(f'i: {i} a: {a}')
assert abs(a - (-1 / 8.)) < 10e-4, "a did not converge to -1 / 8."
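# For reference, a minimal sketch (not part of the original gist) of the same
# descent using the analytic derivative F'[a] = 2xa + 1 instead of a finite
# difference; the name direct_gradient_descent_analytic is illustrative only.
def direct_gradient_descent_analytic():
    x = 4
    a = 1.
    eta = 1e-1
    n_iters = 10 ** 4
    for i in range(n_iters):
        dF_a = 2 * x * a + 1  # exact derivative of F[a] = xa^2 + a
        a += - eta * dF_a
        if i % 10 ** 3 == 0:
            print(f'i: {i} a: {a}')
    assert abs(a - (-1 / 8.)) < 10e-4, "a did not converge to -1 / 8."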
class NodeFunction(object):
def __init__(self, name, n_inputs):
self.name = name
self.n_inputs = n_inputs
self.children = []
self.is_complete = False
self.direct_params = set()
self.indirect_params = set()
def set_complete(self):
print('registering nodes ...')
self._register()
print('finished registering nodes.')
print('registering param dependencies with nodes ...')
self.build_indirect_params()
print('finished registering params with nodes.')
def all_params(self):
return self.direct_params.union(self.indirect_params)
def _add_indirect_param(self, param):
self.indirect_params.add(param)
def build_indirect_params(self):
for child in self.children:
child_indirect_params = child.build_indirect_params()
self.indirect_params = self.indirect_params.union(child_indirect_params)
print(f'built indirect param set {[_.name for _ in self.indirect_params]} for node [{self.name}]')
return self.direct_params.union(self.indirect_params)
    def _register(self):
        all_names = set()
        for i, child in enumerate(self.children):
            c_all_names = child._register()
            if not all_names.isdisjoint(c_all_names):
                print('error: duplicate node names. exiting')
                sys.exit(0)
            all_names = all_names.union(c_all_names)
            print(f'registered: child [{child.name}] of [{self.name}] at index {i}.')
        # include this node's own name so duplicates anywhere in the tree are caught
        if self.name in all_names:
            print('error: duplicate node names. exiting')
            sys.exit(0)
        all_names.add(self.name)
        self.is_complete = True
        return all_names
def evaluate(self, xi):
# value
pass
def derivative(self, xi, input_index):
# jacobian value at xi wrt input specified
# by input index
pass
def param_derivative(self, xi):
# direct param derivative
pass
def add_child(self, child):
self.children.append(child)
# get evaluation points by forward prop
# and store in dict forward_store
def forward_prop(self, leaf_x, forward_store):
if not self.is_complete:
print('error: graph not complete. exiting')
sys.exit(0)
if not self.children:
forward_store[self.name] = leaf_x
return self.evaluate(leaf_x)
else:
xi = []
for child in self.children:
xc = child.forward_prop(leaf_x, forward_store)
xi.append(xc)
            if len(xi) == 1:
                forward_store[self.name] = xi[0]
                return self.evaluate(xi[0])
            # multi-input case: evaluate on the full vector of child outputs
            # (only single-input nodes are exercised in this example)
            forward_store[self.name] = xi
            return self.evaluate(xi)
    def _compute_beta(self, param, forward_store, ksi_store, k_store):
        beta = 0.
        xi = forward_store[self.name]
        # direct term: the parameter derivative ksi contributes only if this
        # node itself carries the parameter
        if param in self.direct_params:
            # cache lookup
            if self.name in ksi_store:
                ksi = ksi_store[self.name]
            else:
                ksi = self.param_derivative(xi)
                ksi_store[self.name] = ksi
            beta += ksi
            #print(f'ksi for node [{self.name}]: {ksi}')
        for input_index, child in enumerate(self.children):
            if param in child.all_params():
                # cache lookup
                if child.name in k_store:
                    k = k_store[child.name]
                else:
                    k = self.derivative(xi, input_index)
                    k_store[child.name] = k
                #print(f'k[{self.name}][{child.name}]: {k}')
                beta_c = child._compute_beta(param, forward_store,
                                             ksi_store, k_store)
                beta += k * beta_c
        #print(f'beta for node [{self.name}]: {beta}')
        return beta
def backward_prop(self, param, forward_store):
#print(f"running backprop from root node [{self.name}] on param '{param.name}' ... ")
if not self.is_complete:
print('error: graph not complete. exiting')
sys.exit(0)
#print('finished backprop.')
ksi_store = {}
k_store = {}
return self._compute_beta(param, forward_store,
ksi_store,
k_store)
class Add(NodeFunction):
def __init__(self, name, param):
super().__init__(name, n_inputs=1)
self.param = param
self.direct_params.add(param)
def evaluate(self, xi):
return xi + self.param.value
def derivative(self, xi, input_index):
return 1.
def param_derivative(self, xi):
return 1.
class Mult(NodeFunction):
def __init__(self, name, param):
super().__init__(name, n_inputs=1)
self.param = param
self.direct_params.add(param)
def evaluate(self, xi):
return xi * self.param.value
def derivative(self, xi, input_index):
return self.param.value
def param_derivative(self, xi):
return xi
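# A sketch of how the node API above could be extended: a hypothetical
# parameter-free Square node (not in the original gist). It has no direct
# params, so only the chain-rule term K * beta_child contributes to its beta.
class Square(NodeFunction):
    def __init__(self, name):
        super().__init__(name, n_inputs=1)
    def evaluate(self, xi):
        return xi ** 2
    def derivative(self, xi, input_index):
        return 2. * xi
    def param_derivative(self, xi):
        # no parameter attached to this node
        return 0.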
class Param(object):
def __init__(self, name, init_value):
self.name = name
self.value = init_value
def get_value(self):
return self.value
def set_value(self, value):
self.value = value
def sample_graph():
# build computation graph
params = []
a = Param('a', 1.)
params.append(a)
add = Add('addition node', a)
mult = Mult('multiplication node 1', a)
mult2 = Mult('multiplication node 2', a)
mult.add_child(mult2)
add.add_child(mult)
graph = add
graph.set_complete()
return params, graph
def run_forwardprop_iter(graph, x):
forward_store = {}
graph.forward_prop(x, forward_store)
return forward_store
def run_backprop_iter(graph, param, forward_store):
# run backprop on the given parameter
return graph.backward_prop(param, forward_store)
def run_backprop_algorithm():
# build graph and initial parameter values
params, graph = sample_graph()
# learning rate
eta = 1e-3
# number of iterations
n_iter = 10**4
for i in range(n_iter):
        run_iteration(i, params, graph, eta)
a = params[0].value
assert abs(a - (-1 / 8.)) < 10e-4, "a did not converge to -1 / 8."
def run_iteration(i, params, graph, eta):
    # run forward_prop iteration
    x = 4.
    forward_store = run_forwardprop_iter(graph, x)
    # run backward prop iteration with the learning rate passed in
for param in params:
del_p = run_backprop_iter(graph, param, forward_store)
param.set_value(param.get_value() - eta * del_p)
if i % 1000 == 0:
print(f"updated params: '{param.name}': {param.value}")
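# A small gradient check (an illustrative addition, not in the original gist):
# compare the backprop derivative from the graph against a central-difference
# estimate of F'[a] at the initial parameter value. It could be called
# alongside run_backprop_algorithm() in the __main__ block below.
def gradient_check(x=4., eps=1e-6, tol=1e-4):
    params, graph = sample_graph()
    a = params[0]
    forward_store = run_forwardprop_iter(graph, x)
    backprop_grad = run_backprop_iter(graph, a, forward_store)
    F = functional(x)
    numeric_grad = (F(a.value + eps) - F(a.value - eps)) / (2 * eps)
    print(f'backprop gradient: {backprop_grad}, numeric gradient: {numeric_grad}')
    assert abs(backprop_grad - numeric_grad) < tol, "gradient check failed"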
if __name__ == "__main__":
# direct_gradient_descent()
run_backprop_algorithm()