# This is an implementation of a basic feedforward neural network that uses stochastic gradient
# descent as its learning algorithm. It's adapted from an original implementation [1] that is part
# of a great online book on deep learning by Michael Nielsen [2]. This is basically the same class
# as the original implementation; it just has more explicit variable and method
# names, slightly different control flow, and is more heavily commented. I made these changes
# to help myself learn about backpropagation and because I think this adapted version
# would be easier to understand for people new to backprop.
# In addition to the Michael Nielsen book, I would recommend the tutorial by Andrej Karpathy on
# the math behind backpropagation [3] and the YouTube series by 3Blue1Brown on deep learning [4]
# to people trying to learn about gradient descent and backpropagation.
# [1] https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/src/network.py
# [2] http://neuralnetworksanddeeplearning.com/
# [3] http://karpathy.github.io/neuralnets/
# [4] https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi
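# A note on the expected data format (inferred from the methods below, not stated in the
# original): training_data should be a list of (feature, label) tuples, where feature is an
# (n, 1) numpy column vector and label is a one-hot column vector the same length as the
# output layer, while the labels in test_data should be plain class indices
# (see validation_accuracy). With illustrative, not prescribed, sizes and hyperparameters,
# a call might look like:
#   net = Network([784, 30, 10])
#   net.SGD(training_data, number_of_epochs=30, minibatch_size=10, alpha=3.0, test_data=test_data)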
import random

import numpy as np

class Network(object):

    def __init__(self, layer_sizes):
        self.num_layers = len(layer_sizes)
        # initialize biases with random values - a list of numpy arrays,
        # each holding the biases for a downstream layer, so each is
        # of shape (n,1) where n is the number of nodes in that downstream layer
        self.biases = []
        for layer_size in layer_sizes[1:]:
            self.biases.append(np.random.randn(layer_size, 1))
        # initialize weights with random values - a list of numpy arrays,
        # each holding the weights for the current layer, each
        # is of shape (n,m) where n is the number of nodes in the current
        # layer and m is the number of nodes in the previous (upstream) layer
        self.weights = []
        for upstream_layer_size, layer_size in zip(layer_sizes[:-1], layer_sizes[1:]):
            new_weights = np.random.randn(layer_size, upstream_layer_size)
            self.weights.append(new_weights)
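        # e.g. (illustrative layer sizes, not from the original gist) Network([784, 30, 10])
        # would give
        #   biases:  shapes [(30, 1), (10, 1)]
        #   weights: shapes [(30, 784), (10, 30)]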
    def feedforward(self, feature):
        # pass the feature through the network,
        # multiplying by weights and adding biases;
        # doesn't store any activations of hidden layers,
        # just returns the activations of the output layer
        activations = feature
        for arc_layer_i in range(self.num_layers-1):
            weights = self.weights[arc_layer_i]
            biases = self.biases[arc_layer_i]
            zs = np.dot(weights, activations) + biases
            activations = sigmoid(zs)
        return activations
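    # Each pass through the loop above computes, for one layer,
    #   downstream_activations = sigmoid(np.dot(weights, upstream_activations) + biases)
    # i.e. the a' = sigmoid(w.a + b) update described in [2].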
    def SGD(self, training_data, number_of_epochs, minibatch_size, alpha, test_data=None):
        # perform (minibatch) stochastic gradient descent - for every epoch, break the
        # training data up into random minibatches, calculate the average gradient of the
        # cost function wrt the network's weights and biases for each minibatch, and use
        # that gradient to update the weights and biases
        num_training_samples = len(training_data)
        for epoch_i in range(number_of_epochs):
            random.shuffle(training_data)
            minibatches = [training_data[k:k+minibatch_size]
                           for k in range(0, num_training_samples, minibatch_size)]
            for minibatch in minibatches:
                self.update_minibatch(minibatch, alpha)
            if test_data:
                print('Epoch %d: %.2f accuracy' % (
                    epoch_i, self.validation_accuracy(test_data)))
            else:
                print('Epoch %d finished' % epoch_i)
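        # e.g. (illustrative numbers, not from the original gist) with 50,000 training
        # samples and minibatch_size=10, each epoch shuffles the data and then performs
        # 5,000 gradient updates, one per minibatch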
    def update_minibatch(self, minibatch, alpha):
        # initialize placeholders for the gradient -
        # (nabla is the upside-down capital delta symbol
        # used to refer to the gradient) - each list below contains
        # numpy arrays representing the gradient for each layer of weights,
        # and for each layer of biases, respectively
        minibatch_nabla_w_sum = [np.zeros(w.shape) for w in self.weights]
        minibatch_nabla_b_sum = [np.zeros(b.shape) for b in self.biases]
        # go through each sample in the minibatch, performing backprop to get the
        # gradient of the loss function wrt weights and biases, summing them
        # up in the placeholders defined above
        for feature, label in minibatch:
            sample_nabla_w, sample_nabla_b = self.backprop(feature, label)
            minibatch_nabla_w_sum = [snw+mbnw for snw, mbnw in zip(sample_nabla_w, minibatch_nabla_w_sum)]
            minibatch_nabla_b_sum = [snb+mbnb for snb, mbnb in zip(sample_nabla_b, minibatch_nabla_b_sum)]
        # divide the sum of the per-sample gradients by the length
        # of the minibatch to get the mean gradient of the minibatch
        minibatch_nabla_w_mean = [nw_sum / len(minibatch) for nw_sum in minibatch_nabla_w_sum]
        minibatch_nabla_b_mean = [nb_sum / len(minibatch) for nb_sum in minibatch_nabla_b_sum]
        # update weights with gradients of weights
        new_weights = []
        for current_weights, nabla_w in zip(self.weights, minibatch_nabla_w_mean):
            new_weights.append(current_weights - alpha*nabla_w)
        self.weights = new_weights
        # update biases with gradients of biases
        new_biases = []
        for current_biases, gradient_wrt_biases in zip(self.biases, minibatch_nabla_b_mean):
            new_biases.append(current_biases - alpha*gradient_wrt_biases)
        self.biases = new_biases
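        # i.e. each parameter is updated as
        #   w -> w - alpha * (1/m) * sum over the minibatch of dC/dw
        #   b -> b - alpha * (1/m) * sum over the minibatch of dC/db
        # where m = len(minibatch)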
    def backprop(self, feature, label):
        # Note - In diagrams of neural networks, the edges/connections between layers
        # of nodes/neurons are really where the weights and biases are located.
        # Thinking about the network structure from this perspective, it seems
        # like each layer of weights and biases has an upstream layer of neurons and
        # a downstream layer of neurons - a layer of weights is multiplied with the
        # activations (values held by neurons) of its UPSTREAM neurons to yield the
        # activations of its DOWNSTREAM neurons (after adding the bias and passing the
        # result through the activation function). I explicitly use the terms "upstream"
        # and "downstream" here because I think it makes things clearer when talking
        # about backprop, but it does seem to be common terminology that a layer
        # of weights is associated with its "downstream" layer of neurons
        # (the neurons produced when multiplying that layer of weights
        # with its "upstream" layer of neurons).
        # initialize weight and bias gradient placeholders -
        # these are lists of numpy arrays with the same shapes as
        # self.weights and self.biases, respectively
        gradient_wrt_weights = [np.zeros(w.shape) for w in self.weights]
        gradient_wrt_biases = [np.zeros(b.shape) for b in self.biases]
        # feedforward - not using the feedforward() method b/c we need
        # to keep track of the activations (values of neurons) and
        # the zs (inputs to the sigmoid function calls)
        # to use them in calculating gradients with the chain rule
        activation = feature
        activations = [activation]
        zs = []
        for weights, biases in zip(self.weights, self.biases):
            z = np.dot(weights, activation) + biases
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass - once the feedforward is finished and the neurons have their
        # values (their activations), go backward through the network calculating the
        # gradient of the loss function with respect to (wrt) all of the weights
        # and biases in the network
        for l in range(1, self.num_layers):
            # 1) get the gradient of the cost function wrt the downstream activations
            if l == 1:
                # if you're dealing with the output layer weights,
                # then gradient_wrt_downstream_activations is going to be the
                # gradient of the cost function wrt the output of the network
                # (the output calculated in the feedforward above)
                gradient_wrt_downstream_activations = self.cost_gradient_wrt_output(activations[-1], label)
            else:
                # if you're dealing with any other layer of weights, then
                # gradient_wrt_downstream_activations is going to be
                # calculated by multiplying the gradient of the cost
                # function wrt the downstream z values (gradient_wrt_downstream_z
                # - calculated in the previous iteration of this for
                # loop) by the downstream weights (chain rule)
                downstream_weights = self.weights[-l+1]
                gradient_wrt_downstream_activations = np.dot(downstream_weights.transpose(),
                                                             gradient_wrt_downstream_z)
            # 2) get the "local" gradient of the sigmoid function wrt the z values
            # z = (weights*activations)+bias ==> downstream_activations = sigmoid(z)
            z = zs[-l]
            gradient_sig_wrt_z = sigmoid_prime(z)
            # 3) get the gradient of the cost function wrt the z values by multiplying
            # gradient_wrt_downstream_activations with the local gradient of the
            # sigmoid function wrt z (chain rule)
            gradient_wrt_z = gradient_sig_wrt_z * gradient_wrt_downstream_activations
            gradient_wrt_biases[-l] = gradient_wrt_z
            # 4) calculate the gradient wrt the weights by multiplying the
            # gradient wrt the z values by the upstream activations (chain rule)
            upstream_activations = activations[-l-1]
            gradient_wrt_weights[-l] = np.dot(gradient_wrt_z, upstream_activations.transpose())
            # 5) in the next iteration of the loop, gradient_wrt_downstream_z is going
            # to be the gradient wrt the z values calculated in this iteration
            gradient_wrt_downstream_z = gradient_wrt_z
        return (gradient_wrt_weights, gradient_wrt_biases)
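    # Summarizing the backward pass above as the backprop equations from [2]:
    #   output layer:  delta = dC/d(output_activations) * sigmoid'(z)
    #   other layers:  delta = (downstream_weights^T . downstream_delta) * sigmoid'(z)
    #   dC/db = delta
    #   dC/dW = delta . upstream_activations^T
    # where delta is gradient_wrt_z in the code.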
    def validation_accuracy(self, test_data):
        # simply see how accurate the model is on the validation data
        correct_count = 0
        for feature, label in test_data:
            y_hat = np.argmax(self.feedforward(feature))
            if y_hat == label:
                correct_count += 1
        return correct_count / float(len(test_data))
    def cost_gradient_wrt_output(self, output_activations, label):
        # this is just the difference between each activation in the
        # network's output layer and the correct label that the
        # activations are supposed to represent - a label representing
        # the number 2 would be a numpy array with values [0 0 1 0 0 0 0 0 0 0]
        return (output_activations - label)
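    # Note: (output_activations - label) is the gradient of the quadratic cost
    #   C = 0.5 * ||output_activations - label||^2
    # wrt the output activations, which is the cost function used in the
    # original implementation [1].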

def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))


def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))
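
if __name__ == '__main__':
    # Minimal smoke test on synthetic data - illustrative only, not part of the
    # original gist. Features are (5, 1) column vectors, training labels are
    # one-hot (3, 1) vectors, and test labels are plain integers, matching the
    # data format the methods above expect.
    rng = np.random.RandomState(0)

    def one_hot(i, n=3):
        v = np.zeros((n, 1))
        v[i] = 1.0
        return v

    training_data = []
    test_data = []
    for _ in range(200):
        label = rng.randint(3)
        feature = rng.randn(5, 1) + label  # crude class-dependent shift
        training_data.append((feature, one_hot(label)))
    for _ in range(50):
        label = rng.randint(3)
        feature = rng.randn(5, 1) + label
        test_data.append((feature, label))

    net = Network([5, 8, 3])
    net.SGD(training_data, number_of_epochs=5, minibatch_size=10, alpha=1.0,
            test_data=test_data)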