Skip to content

Instantly share code, notes, and snippets.

@lsimmons2
Last active February 18, 2019 18:38
Show Gist options
  • Save lsimmons2/a5aa5c1579e810ee200974867b46e006 to your computer and use it in GitHub Desktop.
Save lsimmons2/a5aa5c1579e810ee200974867b46e006 to your computer and use it in GitHub Desktop.
# This is an implementation of a basic feedforward neural network that uses stochastic gradient
# descent as a learning algorithm. It's adapted from an original implementation [1] that is part
# of a great online book on deep learning by Michael Nielsen [2]. This is basically the same class
# as the original implementation; it just has more explicit variable and method
# names, slightly different control flow, and is more heavily commented. I made these changes
# to help myself learn about backpropagation and because I think this adapted version
# would be easier to understand for people new to backprop.
# In addition to the Michael Nielsen book I would recommend the tutorial by Andrej Karpathy on
# the math behind backpropagation[3] and the YouTube series by 3Blue1Brown on deep learning [4]
# to people trying to learn about gradient descent and backpropagation.
# [1] https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/src/network.py
# [2] http://neuralnetworksanddeeplearning.com/
# [3] http://karpathy.github.io/neuralnets/
# [4] https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi
import random
import numpy as np
class Network(object):
    """Basic feedforward neural network trained with minibatch SGD.

    Every layer applies the sigmoid activation. Parameters are stored as two
    parallel lists with one entry per layer of connections:

    * ``self.weights[i]`` has shape ``(n, m)`` where ``n`` is the number of
      nodes in the downstream layer and ``m`` the number of nodes in the
      upstream layer.
    * ``self.biases[i]`` has shape ``(n, 1)`` where ``n`` is the number of
      nodes in the downstream layer.
    """

    def __init__(self, layer_sizes):
        """Create a network with randomly initialised weights and biases.

        Args:
            layer_sizes: sequence of node counts, one per layer, input layer
                first (e.g. ``[784, 30, 10]``).
        """
        self.num_layers = len(layer_sizes)
        # One (n, 1) bias column per non-input layer, drawn from a standard
        # normal distribution.
        self.biases = [np.random.randn(layer_size, 1)
                       for layer_size in layer_sizes[1:]]
        # One (n, m) weight matrix per pair of adjacent layers, where m is the
        # size of the upstream layer and n the size of the downstream layer.
        self.weights = [
            np.random.randn(layer_size, upstream_layer_size)
            for upstream_layer_size, layer_size
            in zip(layer_sizes[:-1], layer_sizes[1:])
        ]

    def feedforward(self, feature):
        """Return the output-layer activations for ``feature``.

        The input is passed through every layer (multiply by weights, add
        biases, apply sigmoid). Hidden-layer activations are not stored.

        Args:
            feature: input activations; presumably a column vector of shape
                ``(input_size, 1)`` - confirm against the caller.
        """
        activations = feature
        for weights, biases in zip(self.weights, self.biases):
            zs = np.dot(weights, activations) + biases
            activations = sigmoid(zs)
        return activations

    def SGD(self, training_data, number_of_epochs, minibatch_size, alpha, test_data=None):
        """Train the network with minibatch stochastic gradient descent.

        For every epoch the training data is shuffled, cut into minibatches,
        and the parameters are updated once per minibatch using the mean
        gradient of that minibatch. A progress line is printed per epoch.

        Args:
            training_data: list of ``(feature, label)`` pairs. It is shuffled
                IN PLACE every epoch, so it must be a mutable sequence.
            number_of_epochs: number of passes over ``training_data``.
            minibatch_size: number of samples per minibatch (the last one of
                an epoch may be smaller).
            alpha: learning rate.
            test_data: optional ``(feature, label)`` pairs; when truthy, the
                accuracy on it is printed after each epoch.
        """
        num_training_samples = len(training_data)
        for epoch_i in range(number_of_epochs):
            random.shuffle(training_data)
            minibatches = [training_data[k:k + minibatch_size]
                           for k in range(0, num_training_samples, minibatch_size)]
            for minibatch in minibatches:
                self.update_minibatch(minibatch, alpha)
            # Parenthesised single-argument print behaves identically as a
            # Python 2 statement and as a Python 3 function call.
            if test_data:
                print('Epoch %d: %.2f accuracy' % (
                    epoch_i, self.validation_accuracy(test_data)))
            else:
                print('Epoch %d finished' % epoch_i)

    def update_minibatch(self, minibatch, alpha):
        """Apply one gradient-descent step using the mean gradient of ``minibatch``.

        Args:
            minibatch: sequence of ``(feature, label)`` pairs.
            alpha: learning rate that scales the step.
        """
        batch_size = len(minibatch)
        if batch_size == 0:
            # Nothing to learn from; also avoids a 0/0 mean that would turn
            # every weight and bias into NaN.
            return

        # Placeholders for the summed gradient (nabla, an inverted capital
        # delta, is the symbol conventionally used for the gradient). Each
        # list holds one array per layer of weights / biases.
        minibatch_nabla_w_sum = [np.zeros(w.shape) for w in self.weights]
        minibatch_nabla_b_sum = [np.zeros(b.shape) for b in self.biases]

        # Backprop each sample to get the gradient of the loss wrt weights and
        # biases, accumulating into the placeholders above.
        for feature, label in minibatch:
            sample_nabla_w, sample_nabla_b = self.backprop(feature, label)
            minibatch_nabla_w_sum = [snw + mbnw for snw, mbnw
                                     in zip(sample_nabla_w, minibatch_nabla_w_sum)]
            minibatch_nabla_b_sum = [snb + mbnb for snb, mbnb
                                     in zip(sample_nabla_b, minibatch_nabla_b_sum)]

        # Mean gradient of the minibatch. The division is done per layer
        # because the per-layer arrays have different shapes and cannot be
        # packed into a single NumPy array.
        minibatch_nabla_w_mean = [np.divide(nabla_w, batch_size)
                                  for nabla_w in minibatch_nabla_w_sum]
        minibatch_nabla_b_mean = [np.divide(nabla_b, batch_size)
                                  for nabla_b in minibatch_nabla_b_sum]

        # Step against the gradient.
        self.weights = [current_weights - alpha * nabla_w
                        for current_weights, nabla_w
                        in zip(self.weights, minibatch_nabla_w_mean)]
        self.biases = [current_biases - alpha * nabla_b
                       for current_biases, nabla_b
                       in zip(self.biases, minibatch_nabla_b_mean)]

    def backprop(self, feature, label):
        """Return the gradient of the cost for a single sample.

        Note on terminology: in diagrams of neural networks the
        edges/connections between layers of nodes/neurons are really where
        the weights and biases are located. Seen that way, each layer of
        weights and biases has an UPSTREAM layer of neurons and a DOWNSTREAM
        layer of neurons - the weights are multiplied with the activations of
        the upstream neurons to yield (after adding the bias and applying the
        activation function) the activations of the downstream neurons. The
        terms "upstream" and "downstream" are used explicitly here; the more
        common convention is to associate a layer of weights with its
        downstream layer of neurons.

        Args:
            feature: input activations for the sample.
            label: target output-layer activations for the sample.

        Returns:
            Tuple ``(gradient_wrt_weights, gradient_wrt_biases)``: two lists
            of arrays with the same shapes as ``self.weights`` and
            ``self.biases`` respectively.
        """
        gradient_wrt_weights = [np.zeros(w.shape) for w in self.weights]
        gradient_wrt_biases = [np.zeros(b.shape) for b in self.biases]

        # Forward pass - feedforward() is not used because the activations
        # (values of neurons) and zs (inputs to the sigmoid calls) of every
        # layer are needed for the chain rule below.
        activation = feature
        activations = [activation]
        zs = []
        for weights, biases in zip(self.weights, self.biases):
            z = np.dot(weights, activation) + biases
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Backward pass - walk from the output layer towards the input,
        # computing the gradient of the cost with respect to (wrt) every
        # weight and bias. ``layer_offset`` counts layers of weights from the
        # end: 1 is the output layer's weights.
        for layer_offset in range(1, self.num_layers):
            # 1) gradient of the cost wrt the downstream activations
            if layer_offset == 1:
                # Output layer: gradient of the cost wrt the network output
                # computed in the forward pass.
                gradient_wrt_downstream_activations = self.cost_gradient_wrt_output(
                    activations[-1], label)
            else:
                # Any other layer: propagate the gradient wrt the downstream
                # z values (from the previous iteration) back through the
                # downstream weights (chain rule).
                downstream_weights = self.weights[-layer_offset + 1]
                gradient_wrt_downstream_activations = np.dot(
                    downstream_weights.transpose(), gradient_wrt_downstream_z)

            # 2) "local" gradient of the sigmoid wrt its z values
            #    z = (weights*activations)+bias ==> downstream_activations = sig(z)
            gradient_sig_wrt_z = sigmoid_prime(zs[-layer_offset])

            # 3) gradient of the cost wrt z (chain rule); this is also the
            #    gradient wrt the biases since dz/db = 1
            gradient_wrt_z = gradient_sig_wrt_z * gradient_wrt_downstream_activations
            gradient_wrt_biases[-layer_offset] = gradient_wrt_z

            # 4) gradient wrt the weights: gradient wrt z times the upstream
            #    activations (chain rule)
            upstream_activations = activations[-layer_offset - 1]
            gradient_wrt_weights[-layer_offset] = np.dot(
                gradient_wrt_z, upstream_activations.transpose())

            # 5) the next iteration (one layer further upstream) needs this
            #    layer's gradient wrt z
            gradient_wrt_downstream_z = gradient_wrt_z

        return (gradient_wrt_weights, gradient_wrt_biases)

    def validation_accuracy(self, test_data):
        """Return the fraction of ``test_data`` samples classified correctly.

        A sample counts as correct when the index of the largest output
        activation equals its label, so labels here are compared as class
        indices rather than as output vectors.

        Args:
            test_data: non-empty sequence of ``(feature, label)`` pairs.
        """
        correct_count = 0
        for feature, label in test_data:
            y_hat = np.argmax(self.feedforward(feature))
            if y_hat == label:
                correct_count += 1
        return correct_count / float(len(test_data))

    def cost_gradient_wrt_output(self, output_activations, label):
        """Return the gradient of the cost wrt the output activations.

        This is the difference between each activation in the network's
        output layer and the label those activations are supposed to
        represent (the derivative of a quadratic cost
        ``0.5 * ||output - label||^2``). A label representing the number 2
        would be an array with values ``[0 0 1 0 0 0 0 0 0 0]``.
        """
        return (output_activations - label)
def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + e**(-z))`` of ``z``.

    The exponential is evaluated with ``np.exp``, so whatever ``np.exp``
    accepts (a scalar or a NumPy array, handled element-wise) works here.
    """
    exp_of_negated_z = np.exp(-z)
    denominator = 1.0 + exp_of_negated_z
    return 1.0 / denominator
def sigmoid_prime(z):
    """Return the derivative of the sigmoid evaluated at ``z``.

    Uses the identity ``sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))``.
    """
    sigmoid_of_z = sigmoid(z)
    return sigmoid_of_z * (1 - sigmoid_of_z)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment