# lsimmons2/nielsen_neural_network_adapted.py

## nielsen_neural_network_adapted.py
# This is an implementation of a basic feedforward neural network that uses stochastic gradient
# descent as a learning algorithm. It's adapted from an original implementation [1] that is part
# of a great online book on deep learning by Michael Nielsen [2]. This is basically the same class
# as the original implementation, it just has more explicit variable and method
# names, slightly different control flow, and is more heavily commented. I made these changes
# to help myself learn about backpropagation and because I think this adapted version
# would be easier to understand for people new to backprop.

# In addition to the Michael Nielsen book, I would recommend the tutorial by Andrej Karpathy on
# the math behind backpropagation [3] and the YouTube series by 3Blue1Brown on deep learning [4]
# to people trying to learn about gradient descent and backpropagation.

# [1] https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/src/network.py
# [2] http://neuralnetworksanddeeplearning.com/
# [3] http://karpathy.github.io/neuralnets/
# [4] https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi


import random
import numpy as np


class Network(object):
    """Basic feedforward neural network trained with minibatch SGD.

    Every layer after the input layer computes
    ``sigmoid(np.dot(weights, upstream_activations) + biases)``.

    Attributes:
        num_layers: number of layers, counting the input layer.
        biases: list of numpy arrays, one per non-input layer, each of
            shape (n, 1) where n is the number of nodes in that layer.
        weights: list of numpy arrays, one per non-input layer, each of
            shape (n, m) where n is the number of nodes in that layer and
            m is the number of nodes in the previous (upstream) layer.
    """

    def __init__(self, layer_sizes):
        """Build a network with randomly initialized weights and biases.

        Args:
            layer_sizes: sequence holding the number of nodes in each
                layer, input layer first, e.g. [2, 3, 1].
        """
        self.num_layers = len(layer_sizes)

        # initialize biases with random values - all biases are drawn
        # before any weights, one (n, 1) array per non-input layer
        self.biases = []
        for layer_size in layer_sizes[1:]:
            self.biases.append(np.random.randn(layer_size, 1))

        # initialize weights with random values - pair every layer with
        # its upstream neighbour to get the (n, m) shape of the weights
        # sitting between the two
        self.weights = []
        for upstream_layer_size, layer_size in zip(layer_sizes[:-1], layer_sizes[1:]):
            self.weights.append(np.random.randn(layer_size, upstream_layer_size))

    def feedforward(self, feature):
        """Return the activations of the output layer for ``feature``.

        The activations of the hidden layers are not stored.

        Args:
            feature: input activations; presumably a column vector of
                shape (layer_sizes[0], 1) - it is used as the right-hand
                operand of np.dot with the first layer of weights.
        """
        activations = feature
        for weights, biases in zip(self.weights, self.biases):
            zs = np.dot(weights, activations) + biases
            activations = sigmoid(zs)
        return activations

    def SGD(self, training_data, number_of_epochs, minibatch_size, alpha, test_data=None):
        """Train the network with (minibatch) stochastic gradient descent.

        For every epoch the training data is shuffled, cut into
        minibatches, and the parameters are updated once per minibatch.
        One progress line is printed per epoch.

        Args:
            training_data: list of (feature, label) pairs. NOTE: it is
                shuffled IN PLACE at the start of every epoch.
            number_of_epochs: number of passes over the training data.
            minibatch_size: number of samples per minibatch (the last
                minibatch of an epoch may be smaller).
            alpha: learning rate - the factor the mean gradient is
                multiplied with before it is subtracted.
            test_data: optional (feature, label) pairs; when given (and
                non-empty) the accuracy on it is printed after each epoch.
        """
        num_training_samples = len(training_data)
        for epoch_i in range(number_of_epochs):
            random.shuffle(training_data)
            for start in range(0, num_training_samples, minibatch_size):
                minibatch = training_data[start:start + minibatch_size]
                self.update_minibatch(minibatch, alpha)
            # print(...) with a single pre-formatted string behaves the same
            # under Python 2 and Python 3
            if test_data:
                print('Epoch %d: %.2f accuracy' % (
                    epoch_i, self.validation_accuracy(test_data)))
            else:
                print('Epoch %d finished' % epoch_i)

    def update_minibatch(self, minibatch, alpha):
        """Take one gradient descent step using the samples in ``minibatch``.

        Args:
            minibatch: sequence of (feature, label) pairs.
            alpha: learning rate.
        """
        minibatch_length = len(minibatch)
        if minibatch_length == 0:
            # nothing to average - dividing by zero below would fill the
            # weights and biases with NaN
            return

        # initialize placeholders for the gradient -
        # (nabla is the symbol used to refer to the gradient) - each list
        # below contains numpy arrays representing the gradient for each
        # layer of weights, and for each layer of biases, respectively
        minibatch_nabla_w_sum = [np.zeros(w.shape) for w in self.weights]
        minibatch_nabla_b_sum = [np.zeros(b.shape) for b in self.biases]

        # go through each sample in minibatch, performing backprop to get the
        # gradient of the loss function wrt weights and biases, summing them
        # up in the placeholders defined above
        for feature, label in minibatch:
            sample_nabla_w, sample_nabla_b = self.backprop(feature, label)
            minibatch_nabla_w_sum = [
                snw + mbnw for snw, mbnw in zip(sample_nabla_w, minibatch_nabla_w_sum)]
            minibatch_nabla_b_sum = [
                snb + mbnb for snb, mbnb in zip(sample_nabla_b, minibatch_nabla_b_sum)]

        # divide sum of gradients by length of minibatch to get the mean
        # gradient - done layer by layer because the layers generally have
        # different shapes and cannot be packed into one numpy array
        minibatch_nabla_w_mean = [
            nabla_w / float(minibatch_length) for nabla_w in minibatch_nabla_w_sum]
        minibatch_nabla_b_mean = [
            nabla_b / float(minibatch_length) for nabla_b in minibatch_nabla_b_sum]

        # update weights with gradients of weights
        self.weights = [
            current_weights - alpha * nabla_w
            for current_weights, nabla_w in zip(self.weights, minibatch_nabla_w_mean)]

        # update biases with gradients of biases
        self.biases = [
            current_biases - alpha * nabla_b
            for current_biases, nabla_b in zip(self.biases, minibatch_nabla_b_mean)]

    def backprop(self, feature, label):
        """Return the gradient of the cost for a single training sample.

        Args:
            feature: input activations of the sample.
            label: desired output activations; it is subtracted from the
                output of the network, so it has to be compatible with it.

        Returns:
            Tuple (gradient_wrt_weights, gradient_wrt_biases) - two lists
            of numpy arrays with the same shapes as self.weights and
            self.biases, respectively.
        """
        # Note - In diagrams of neural networks the edges/connections between
        # layers of nodes/neurons are really where the weights and biases are
        # located. From this perspective each layer of weights and biases has
        # an UPSTREAM layer of neurons and a DOWNSTREAM layer of neurons - a
        # layer of weights is multiplied with the activations (values held by
        # neurons) of its upstream neurons to yield the activations of its
        # downstream neurons (after adding the bias and going through the
        # activation function). It seems to be common terminology to
        # associate a layer of weights with its downstream layer of neurons.

        # initialize weight and bias gradient placeholders -
        # these are lists of numpy arrays that are the same shapes as
        # self.weights and self.biases, respectively
        gradient_wrt_weights = [np.zeros(w.shape) for w in self.weights]
        gradient_wrt_biases = [np.zeros(b.shape) for b in self.biases]

        # feedforward - not using feedforward() method b/c we need
        # to keep track of activations (values of neurons) and
        # zs (values of the inputs to sigmoid function calls)
        # to use them in calculating gradients with the chain rule
        activation = feature
        activations = [activation]
        zs = []
        for weights, biases in zip(self.weights, self.biases):
            z = np.dot(weights, activation) + biases
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # backward pass - go backward from the output layer calculating the
        # gradient of the cost function with respect to (wrt) all the
        # weights and biases; depth counts layers from the end, so
        # negative indexing with -depth walks the lists back to front
        gradient_wrt_downstream_z = None
        for depth in range(1, self.num_layers):

            # 1) get the gradient of the cost function wrt the downstream
            # activations
            if depth == 1:
                # output layer weights: this is the gradient of the cost
                # function wrt the output of the network
                gradient_wrt_downstream_activations = self.cost_gradient_wrt_output(
                    activations[-1], label)
            else:
                # any other layer of weights: multiply the gradient wrt the
                # z values one layer further downstream (calculated in the
                # previous iteration of this loop) by the weights that
                # produced those z values (chain rule)
                downstream_weights = self.weights[-depth + 1]
                gradient_wrt_downstream_activations = np.dot(
                    downstream_weights.transpose(), gradient_wrt_downstream_z)

            # 2) get "local" gradient of the sigmoid function wrt z values
            # z = (weights*activations)+bias ==> downstream_activations = sig(z)
            gradient_sig_wrt_z = sigmoid_prime(zs[-depth])

            # 3) get gradient of cost function wrt the z values (chain rule);
            # z = ... + bias, so this is also the gradient wrt the biases
            gradient_wrt_z = gradient_sig_wrt_z * gradient_wrt_downstream_activations
            gradient_wrt_biases[-depth] = gradient_wrt_z

            # 4) calculate the gradient of the weights by multiplying the
            # gradient wrt z values by the upstream activations (chain rule)
            upstream_activations = activations[-depth - 1]
            gradient_wrt_weights[-depth] = np.dot(
                gradient_wrt_z, upstream_activations.transpose())

            # 5) in the next iteration of the loop, the gradient wrt the z
            # values of this iteration is the "downstream" one
            gradient_wrt_downstream_z = gradient_wrt_z

        return (gradient_wrt_weights, gradient_wrt_biases)

    def validation_accuracy(self, test_data):
        """Return the fraction of ``test_data`` that is classified correctly.

        Args:
            test_data: non-empty sequence of (feature, label) pairs; label
                is compared with the index of the largest output
                activation, so it is expected to be a class index.

        Raises:
            ZeroDivisionError: if test_data is empty.
        """
        correct_count = 0
        for feature, label in test_data:
            y_hat = np.argmax(self.feedforward(feature))
            if y_hat == label:
                correct_count += 1
        return correct_count / float(len(test_data))

    def cost_gradient_wrt_output(self, output_activations, label):
        """Return the gradient of the cost wrt the output activations.

        This is just the difference between each activation in the
        network's output layer and the correct label that the activations
        are supposed to represent - a label representing number 2 would be
        a numpy array with values [0 0 1 0 0 0 0 0 0 0].
        """
        return (output_activations - label)


def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + e^(-z)) of z.

    z may be a number or a numpy array; for an array the function is
    applied to every element.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Derivative of the sigmoid function, evaluated at z.

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)); the
    sigmoid is evaluated once and reused for both factors.
    """
    sigmoid_of_z = sigmoid(z)
    return sigmoid_of_z * (1 - sigmoid_of_z)
	# This is an implementation of a basic feedforward neural network that uses stochastic gradient
	# descent as a learning algorithm. It's adapted from an original implementation [1] that is part
	# of a great online book on deep learning by Michael Nielsen [2]. This is basically the same class
	# as the original implementation, it just has more explicit variable and method
	# names, slightly different control flow, and is more heavily commented. I made these changes
	# to help myself learn about backpropagation and because I think this adapted version
	# would be easier to understand for people new to backprop.

	# In addition to the Michael Nielsen book, I would recommend the tutorial by Andrej Karpathy on
	# the math behind backpropagation [3] and the YouTube series by 3Blue1Brown on deep learning [4]
	# to people trying to learn about gradient descent and backpropagation.

	# [1] https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/src/network.py
	# [2] http://neuralnetworksanddeeplearning.com/
	# [3] http://karpathy.github.io/neuralnets/
	# [4] https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi




	import random
	import numpy as np


	class Network(object):
	    """Basic feedforward neural network trained with minibatch SGD.

	    Every layer after the input layer computes
	    ``sigmoid(np.dot(weights, upstream_activations) + biases)``.

	    Attributes:
	        num_layers: number of layers, counting the input layer.
	        biases: list of numpy arrays, one per non-input layer, each of
	            shape (n, 1) where n is the number of nodes in that layer.
	        weights: list of numpy arrays, one per non-input layer, each of
	            shape (n, m) where n is the number of nodes in that layer and
	            m is the number of nodes in the previous (upstream) layer.
	    """

	    def __init__(self, layer_sizes):
	        """Build a network with randomly initialized weights and biases.

	        Args:
	            layer_sizes: sequence holding the number of nodes in each
	                layer, input layer first, e.g. [2, 3, 1].
	        """
	        self.num_layers = len(layer_sizes)

	        # initialize biases with random values - all biases are drawn
	        # before any weights, one (n, 1) array per non-input layer
	        self.biases = []
	        for layer_size in layer_sizes[1:]:
	            self.biases.append(np.random.randn(layer_size, 1))

	        # initialize weights with random values - pair every layer with
	        # its upstream neighbour to get the (n, m) shape of the weights
	        # sitting between the two
	        self.weights = []
	        for upstream_layer_size, layer_size in zip(layer_sizes[:-1], layer_sizes[1:]):
	            self.weights.append(np.random.randn(layer_size, upstream_layer_size))

	    def feedforward(self, feature):
	        """Return the activations of the output layer for ``feature``.

	        The activations of the hidden layers are not stored.

	        Args:
	            feature: input activations; presumably a column vector of
	                shape (layer_sizes[0], 1) - it is used as the right-hand
	                operand of np.dot with the first layer of weights.
	        """
	        activations = feature
	        for weights, biases in zip(self.weights, self.biases):
	            zs = np.dot(weights, activations) + biases
	            activations = sigmoid(zs)
	        return activations

	    def SGD(self, training_data, number_of_epochs, minibatch_size, alpha, test_data=None):
	        """Train the network with (minibatch) stochastic gradient descent.

	        For every epoch the training data is shuffled, cut into
	        minibatches, and the parameters are updated once per minibatch.
	        One progress line is printed per epoch.

	        Args:
	            training_data: list of (feature, label) pairs. NOTE: it is
	                shuffled IN PLACE at the start of every epoch.
	            number_of_epochs: number of passes over the training data.
	            minibatch_size: number of samples per minibatch (the last
	                minibatch of an epoch may be smaller).
	            alpha: learning rate - the factor the mean gradient is
	                multiplied with before it is subtracted.
	            test_data: optional (feature, label) pairs; when given (and
	                non-empty) the accuracy on it is printed after each epoch.
	        """
	        num_training_samples = len(training_data)
	        for epoch_i in range(number_of_epochs):
	            random.shuffle(training_data)
	            for start in range(0, num_training_samples, minibatch_size):
	                minibatch = training_data[start:start + minibatch_size]
	                self.update_minibatch(minibatch, alpha)
	            # print(...) with a single pre-formatted string behaves the same
	            # under Python 2 and Python 3
	            if test_data:
	                print('Epoch %d: %.2f accuracy' % (
	                    epoch_i, self.validation_accuracy(test_data)))
	            else:
	                print('Epoch %d finished' % epoch_i)

	    def update_minibatch(self, minibatch, alpha):
	        """Take one gradient descent step using the samples in ``minibatch``.

	        Args:
	            minibatch: sequence of (feature, label) pairs.
	            alpha: learning rate.
	        """
	        minibatch_length = len(minibatch)
	        if minibatch_length == 0:
	            # nothing to average - dividing by zero below would fill the
	            # weights and biases with NaN
	            return

	        # initialize placeholders for the gradient -
	        # (nabla is the symbol used to refer to the gradient) - each list
	        # below contains numpy arrays representing the gradient for each
	        # layer of weights, and for each layer of biases, respectively
	        minibatch_nabla_w_sum = [np.zeros(w.shape) for w in self.weights]
	        minibatch_nabla_b_sum = [np.zeros(b.shape) for b in self.biases]

	        # go through each sample in minibatch, performing backprop to get the
	        # gradient of the loss function wrt weights and biases, summing them
	        # up in the placeholders defined above
	        for feature, label in minibatch:
	            sample_nabla_w, sample_nabla_b = self.backprop(feature, label)
	            minibatch_nabla_w_sum = [
	                snw + mbnw for snw, mbnw in zip(sample_nabla_w, minibatch_nabla_w_sum)]
	            minibatch_nabla_b_sum = [
	                snb + mbnb for snb, mbnb in zip(sample_nabla_b, minibatch_nabla_b_sum)]

	        # divide sum of gradients by length of minibatch to get the mean
	        # gradient - done layer by layer because the layers generally have
	        # different shapes and cannot be packed into one numpy array
	        minibatch_nabla_w_mean = [
	            nabla_w / float(minibatch_length) for nabla_w in minibatch_nabla_w_sum]
	        minibatch_nabla_b_mean = [
	            nabla_b / float(minibatch_length) for nabla_b in minibatch_nabla_b_sum]

	        # update weights with gradients of weights
	        self.weights = [
	            current_weights - alpha * nabla_w
	            for current_weights, nabla_w in zip(self.weights, minibatch_nabla_w_mean)]

	        # update biases with gradients of biases
	        self.biases = [
	            current_biases - alpha * nabla_b
	            for current_biases, nabla_b in zip(self.biases, minibatch_nabla_b_mean)]

	    def backprop(self, feature, label):
	        """Return the gradient of the cost for a single training sample.

	        Args:
	            feature: input activations of the sample.
	            label: desired output activations; it is subtracted from the
	                output of the network, so it has to be compatible with it.

	        Returns:
	            Tuple (gradient_wrt_weights, gradient_wrt_biases) - two lists
	            of numpy arrays with the same shapes as self.weights and
	            self.biases, respectively.
	        """
	        # Note - In diagrams of neural networks the edges/connections between
	        # layers of nodes/neurons are really where the weights and biases are
	        # located. From this perspective each layer of weights and biases has
	        # an UPSTREAM layer of neurons and a DOWNSTREAM layer of neurons - a
	        # layer of weights is multiplied with the activations (values held by
	        # neurons) of its upstream neurons to yield the activations of its
	        # downstream neurons (after adding the bias and going through the
	        # activation function). It seems to be common terminology to
	        # associate a layer of weights with its downstream layer of neurons.

	        # initialize weight and bias gradient placeholders -
	        # these are lists of numpy arrays that are the same shapes as
	        # self.weights and self.biases, respectively
	        gradient_wrt_weights = [np.zeros(w.shape) for w in self.weights]
	        gradient_wrt_biases = [np.zeros(b.shape) for b in self.biases]

	        # feedforward - not using feedforward() method b/c we need
	        # to keep track of activations (values of neurons) and
	        # zs (values of the inputs to sigmoid function calls)
	        # to use them in calculating gradients with the chain rule
	        activation = feature
	        activations = [activation]
	        zs = []
	        for weights, biases in zip(self.weights, self.biases):
	            z = np.dot(weights, activation) + biases
	            zs.append(z)
	            activation = sigmoid(z)
	            activations.append(activation)

	        # backward pass - go backward from the output layer calculating the
	        # gradient of the cost function with respect to (wrt) all the
	        # weights and biases; depth counts layers from the end, so
	        # negative indexing with -depth walks the lists back to front
	        gradient_wrt_downstream_z = None
	        for depth in range(1, self.num_layers):

	            # 1) get the gradient of the cost function wrt the downstream
	            # activations
	            if depth == 1:
	                # output layer weights: this is the gradient of the cost
	                # function wrt the output of the network
	                gradient_wrt_downstream_activations = self.cost_gradient_wrt_output(
	                    activations[-1], label)
	            else:
	                # any other layer of weights: multiply the gradient wrt the
	                # z values one layer further downstream (calculated in the
	                # previous iteration of this loop) by the weights that
	                # produced those z values (chain rule)
	                downstream_weights = self.weights[-depth + 1]
	                gradient_wrt_downstream_activations = np.dot(
	                    downstream_weights.transpose(), gradient_wrt_downstream_z)

	            # 2) get "local" gradient of the sigmoid function wrt z values
	            # z = (weights*activations)+bias ==> downstream_activations = sig(z)
	            gradient_sig_wrt_z = sigmoid_prime(zs[-depth])

	            # 3) get gradient of cost function wrt the z values (chain rule);
	            # z = ... + bias, so this is also the gradient wrt the biases
	            gradient_wrt_z = gradient_sig_wrt_z * gradient_wrt_downstream_activations
	            gradient_wrt_biases[-depth] = gradient_wrt_z

	            # 4) calculate the gradient of the weights by multiplying the
	            # gradient wrt z values by the upstream activations (chain rule)
	            upstream_activations = activations[-depth - 1]
	            gradient_wrt_weights[-depth] = np.dot(
	                gradient_wrt_z, upstream_activations.transpose())

	            # 5) in the next iteration of the loop, the gradient wrt the z
	            # values of this iteration is the "downstream" one
	            gradient_wrt_downstream_z = gradient_wrt_z

	        return (gradient_wrt_weights, gradient_wrt_biases)

	    def validation_accuracy(self, test_data):
	        """Return the fraction of ``test_data`` that is classified correctly.

	        Args:
	            test_data: non-empty sequence of (feature, label) pairs; label
	                is compared with the index of the largest output
	                activation, so it is expected to be a class index.

	        Raises:
	            ZeroDivisionError: if test_data is empty.
	        """
	        correct_count = 0
	        for feature, label in test_data:
	            y_hat = np.argmax(self.feedforward(feature))
	            if y_hat == label:
	                correct_count += 1
	        return correct_count / float(len(test_data))

	    def cost_gradient_wrt_output(self, output_activations, label):
	        """Return the gradient of the cost wrt the output activations.

	        This is just the difference between each activation in the
	        network's output layer and the correct label that the activations
	        are supposed to represent - a label representing number 2 would be
	        a numpy array with values [0 0 1 0 0 0 0 0 0 0].
	        """
	        return (output_activations - label)


	def sigmoid(z):
	    """Return the logistic sigmoid 1 / (1 + e^(-z)) of z.

	    z may be a number or a numpy array; for an array the function is
	    applied to every element.
	    """
	    return 1.0 / (1.0 + np.exp(-z))

	def sigmoid_prime(z):
	    """Derivative of the sigmoid function, evaluated at z.

	    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)); the
	    sigmoid is evaluated once and reused for both factors.
	    """
	    sigmoid_of_z = sigmoid(z)
	    return sigmoid_of_z * (1 - sigmoid_of_z)