Notes and Implementation of Back-propagation Algorithm
"# Notes and Implementation of Backpropagation Algorithm\n",
"### A third attempt (finally a successful one) to understand the mechanics behind a neural network, feat. calculus\n",
"## Explanation and Math"
1,3 | 2,4
"![]( | ![](\n",
"![]( | ![](\n",
"## Notation used\n",
"**Weights** | **Biases (star)**\n",
"![]( | ![](\n",
"#### *read notes to better understand*\n",
"### Formulas to be further used in code : \n",
"### Code"
"%matplotlib inline\n",
"import numpy as np\n",
"import random\n",
"import matplotlib.pyplot as plt\n",
"import json\n",
"import sys"
"# helper functions (here, activation functions)\n",
def sigmoid(z):
    """Apply the logistic sigmoid ``1 / (1 + exp(-z))`` element-wise.

    Args:
        z: Scalar or numpy array of pre-activation values.

    Returns:
        The sigmoid of ``z``; arrays keep the shape of the input.
    """
    return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(x):
    """Return the derivative of the sigmoid function evaluated at ``x``.

    Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.

    Args:
        x: Scalar or numpy array of pre-activation values.

    Returns:
        The derivative, element-wise for array input.
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(x)
    return s * (1 - s)
def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    rectified = np.maximum(0, z)
    return rectified
def one_hot_encoding(j):
    """One-hot encode the class index ``j`` as a 10-dimensional column vector.

    Args:
        j: Index (0..9) of the entry to set.

    Returns:
        A ``(10, 1)`` numpy array that is 1.0 at position ``j`` and 0.0 elsewhere.
    """
    encoded_vec = np.zeros((10, 1))
    encoded_vec[j] = 1.0
    return encoded_vec
# Loss functions: both the cross-entropy (log loss) and the quadratic (MSE)
# cost are implemented so that their performance can be compared.
# Formulas: each cost is a small class exposing two static methods,
#   func(a, y)     -> scalar cost
#   delta(a, y, z) -> error term of the output layer
class CrossEntropyCost(object):
    """Cross-entropy (log loss) cost."""

    @staticmethod
    def func(a, y):
        """Return the cost associated with output ``a`` and desired output ``y``.

        Formula: ``sum(-y * log(a) - (1 - y) * log(1 - a))``.

        When ``a`` and ``y`` are both exactly 0 or both exactly 1 in some
        position, the formula evaluates ``0 * log(0)``, which is NaN;
        ``np.nan_to_num`` converts those NaNs to 0.0.
        """
        # log(0) and 0 * inf are expected here and handled by nan_to_num below,
        # so silence the corresponding floating-point warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            per_output = -y * np.log(a) - (1 - y) * np.log(1 - a)
        return np.sum(np.nan_to_num(per_output))

    @staticmethod
    def delta(a, y, z):
        """Return the output-layer error ``a - y``.

        ``z`` is not used; it is accepted so that every cost class has the same
        ``delta(a, y, z)`` signature.
        """
        return (a - y)
" \n",
class MSE_cost(object):
    """Quadratic (mean squared error) cost."""

    @staticmethod
    def func(a, y):
        """Return the cost associated with output ``a`` and desired output ``y``.

        Formula: ``0.5 * ||a - y||^2``.
        """
        # ``**`` binds tighter than ``*``: the square must apply to the norm,
        # not to the 0.5 factor.
        return 0.5 * np.linalg.norm(a - y) ** 2

    @staticmethod
    def delta(a, y, z):
        """Return the output-layer error ``(a - y) * sigmoid_prime(z)``.

        Args:
            a: Output activations.
            y: Desired output.
            z: Weighted input of the output neurons (see the derivation in the
                notes).
        """
        return (a - y) * sigmoid_prime(z)
class NN(object):
    """Fully connected feed-forward network with sigmoid activations.

    Training uses mini-batch stochastic gradient descent; the gradients come
    from back-propagation and the weight update includes L2 weight decay.
    """

    def __init__(self, size, cost=CrossEntropyCost):
        """Create a network and randomly initialise its parameters.

        Args:
            size: List with the number of neurons in each layer, input layer
                first.
            cost: Cost class exposing static ``func(a, y)`` and
                ``delta(a, y, z)`` methods.
        """
        self.n_layers = len(size)
        self.size = size
        self.cost = cost
        self.initialize_weight()

    def initialize_weight(self):
        """(Re)initialise the weights and biases.

        Biases are drawn from a Gaussian with mean 0 and SD 1. Weights are
        drawn from a Gaussian with mean 0 and SD 1 divided by the square root
        of the number of weights feeding the same neuron. The input layer has
        no weights or biases, so parameters exist only for layers 2..last.
        """
        self.biases = [np.random.randn(y, 1) for y in self.size[1:]]
        # Pair each layer with the NEXT one so every matrix is (n_out, n_in).
        self.weights = [np.random.randn(y, x) / np.sqrt(x)
                        for x, y in zip(self.size[:-1], self.size[1:])]

    def forward_propagation(self, a):
        """Return the network output for input ``a``.

        Each layer computes ``sigmoid(W a + b)``.
        """
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def back_propagation(self, x, y):
        """Return the gradient of the cost for one training example.

        Args:
            x: Input column vector.
            y: Desired output column vector.

        Returns:
            ``(del_w, del_b)``: layer-by-layer lists of numpy arrays shaped
            like ``self.weights`` and ``self.biases``. Note the order: weights
            first, then biases.
        """
        del_w = [np.zeros(w.shape) for w in self.weights]
        del_b = [np.zeros(b.shape) for b in self.biases]

        # Forward pass: remember every activation and weighted input, since
        # the chain rule needs them on the way back.
        curr_activation = x
        activations = [x]
        z_lis = []
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, curr_activation) + b
            curr_activation = sigmoid(z)
            z_lis.append(z)
            activations.append(curr_activation)

        # Backward pass: start with the error of the output layer.
        delta = self.cost.delta(activations[-1], y, z_lis[-1])
        del_w[-1] = np.dot(delta, activations[-2].transpose())
        del_b[-1] = delta

        # Walk back through the remaining layers; negative indices count from
        # the output layer (layer == 2 is the second-to-last layer).
        for layer in range(2, self.n_layers):
            z = z_lis[-layer]
            del_sigmoid = sigmoid_prime(z)
            delta = np.dot(self.weights[-layer + 1].transpose(), delta) * del_sigmoid
            del_w[-layer] = np.dot(delta, activations[-layer - 1].transpose())
            del_b[-layer] = delta
        return (del_w, del_b)

    def trainer(self, training_data, epochs, m_bs, eta, lmbda, eval_data=None,
                print_eval_cost=False, print_eval_acc=False, print_train_cost=False,
                print_train_acc=False):
        """Train the network using mini-batch stochastic gradient descent.

        Args:
            training_data: Iterable of ``(x, y)`` pairs. A list is shuffled in
                place every epoch; any other iterable is copied to a list
                first.
            epochs: Number of passes over the training data.
            m_bs: Mini-batch size.
            eta: Learning rate.
            lmbda: L2 regularisation parameter.
            eval_data: Optional iterable of ``(x, label)`` pairs.
            print_eval_cost, print_eval_acc, print_train_cost, print_train_acc:
                Flags selecting which per-epoch metrics are computed, printed
                and recorded.

        Returns:
            ``(eval_cost, eval_acc, train_cost, train_acc)``: four lists with
            one entry per epoch for every metric whose flag was set.

        Raises:
            ValueError: If an evaluation metric is requested without
                ``eval_data``.
        """
        # One-shot iterators (e.g. ``zip`` on Python 3) cannot be counted,
        # shuffled or sliced, so materialise them once.
        if not isinstance(training_data, list):
            training_data = list(training_data)
        if eval_data is not None and not isinstance(eval_data, list):
            eval_data = list(eval_data)
        if (print_eval_cost or print_eval_acc) and eval_data is None:
            raise ValueError(
                "eval_data is required when print_eval_cost or print_eval_acc is set")

        n = len(training_data)
        n_data = len(eval_data) if eval_data is not None else 0
        eval_cost, eval_acc = [], []
        train_cost, train_acc = [], []
        for c in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k + m_bs] for k in range(0, n, m_bs)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, n)
            print('Training : Epoch %s complete.' % c)

            if print_train_acc:
                acc = self.accuracy(training_data, convert=True)
                train_acc.append(acc)
                print('Accuracy on training data : {} / {}'.format(acc, n))
            if print_train_cost:
                cost = self.total_cost(training_data, lmbda)
                train_cost.append(cost)
                print('Cost on training data : {}'.format(cost))
            if print_eval_acc:
                acc = self.accuracy(eval_data)
                eval_acc.append(acc)
                print('Accuracy on evaluation data : {} / {}'.format(acc, n_data))
            if print_eval_cost:
                cost = self.total_cost(eval_data, lmbda, convert=True)
                eval_cost.append(cost)
                print('Cost on evaluation data : {}'.format(cost))
            print()
        return (eval_cost, eval_acc, train_cost, train_acc)

    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Apply one gradient-descent step computed from a single mini batch.

        Args:
            mini_batch: List of ``(x, y)`` tuples.
            eta: Learning rate.
            lmbda: L2 regularisation parameter.
            n: Total size of the training data set.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            # back_propagation returns the weight gradients first.
            delta_nabla_w, delta_nabla_b = self.back_propagation(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        step = eta / len(mini_batch)
        # The (1 - eta * lmbda / n) factor is the L2 weight decay; biases are
        # not regularised.
        self.weights = [(1 - eta * (lmbda / n)) * w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def accuracy(self, data, convert=False):
        """Return the number of inputs in ``data`` classified correctly.

        The prediction is the index of the output neuron with the highest
        activation.

        Args:
            data: Iterable of ``(x, y)`` pairs.
            convert: False when ``y`` is already a class index (validation or
                test data, the usual case); True when ``y`` is a one-hot
                vector (training data), in which case its argmax is compared.
        """
        if convert:
            results = [(np.argmax(self.forward_propagation(x)), np.argmax(y))
                       for (x, y) in data]
        else:
            results = [(np.argmax(self.forward_propagation(x)), y)
                       for (x, y) in data]
        return sum(int(predicted == actual) for (predicted, actual) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Return the total regularised cost for the data set ``data``.

        Args:
            data: Sized collection of ``(x, y)`` pairs (``len(data)`` is used).
            lmbda: L2 regularisation parameter.
            convert: True when ``y`` is a class index that must first be
                turned into a vector with ``vectorized_result``; False when
                ``y`` is already a vector. Note this is the opposite
                convention to ``accuracy``.
        """
        n_samples = len(data)
        cost = 0.0
        for x, y in data:
            a = self.forward_propagation(x)
            if convert:
                y = vectorized_result(y)
            cost += self.cost.func(a, y) / n_samples
        # L2 penalty over all weight matrices.
        cost += 0.5 * (lmbda / n_samples) * sum(
            np.linalg.norm(w) ** 2 for w in self.weights)
        return cost

    def save(self, filename):
        """Save the neural network to the file ``filename`` as JSON."""
        data = {"size": self.size,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        # The context manager closes the file even if serialisation fails.
        with open(filename, "w") as f:
            json.dump(data, f)
## Loading Data
"# this python code to load MNIST Data is by Michael Nielsen.\n",
"import _pickle as cPickle\n",
"import gzip\n",
"# Third-party libraries\n",
"import numpy as np\n",
def load_data():
    """Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.

    The ``training_data`` is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.

    The second entry in the ``training_data`` tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.

    The ``validation_data`` and ``test_data`` are similar, except
    each contains only 10,000 images.

    This is a nice data format, but for use in neural networks it's
    helpful to modify the format of the ``training_data`` a little.
    That's done in the wrapper function ``load_data_wrapper()``, see
    below.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load an
    # ``mnist.pkl.gz`` obtained from a trusted source.
    # ``encoding='latin1'`` is passed through to the unpickler unchanged.
    with gzip.open('mnist.pkl.gz', 'rb') as f:
        training_data, validation_data, test_data = cPickle.load(f, encoding='latin1')
    return (training_data, validation_data, test_data)
def load_data_wrapper():
    """Return a tuple containing ``(training_data, validation_data,
    test_data)``. Based on ``load_data``, but the format is more
    convenient for use in our implementation of neural networks.

    In particular, ``training_data`` is a list containing 50,000
    2-tuples ``(x, y)``.  ``x`` is a 784-dimensional numpy.ndarray
    containing the input image.  ``y`` is a 10-dimensional
    numpy.ndarray representing the unit vector corresponding to the
    correct digit for ``x``.

    ``validation_data`` and ``test_data`` are lists containing 10,000
    2-tuples ``(x, y)``.  In each case, ``x`` is a 784-dimensional
    numpy.ndarray containing the input image, and ``y`` is the
    corresponding classification, i.e., the digit values (integers)
    corresponding to ``x``.

    Obviously, this means we're using slightly different formats for
    the training data and the validation / test data.  These formats
    turn out to be the most convenient for use in our neural network
    code.
    """
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    # On Python 3 ``zip`` is a one-shot iterator; build real lists so the data
    # can be shuffled, sliced, measured with ``len`` and iterated repeatedly.
    training_data = list(zip(training_inputs, training_results))
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = list(zip(validation_inputs, va_d[1]))
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = list(zip(test_inputs, te_d[1]))
    return (training_data, validation_data, test_data)
def vectorized_result(j):
    """Convert a digit (0...9) into the desired network output.

    The result is a 10-dimensional unit column vector holding 1.0 in the
    ``j``-th position and zeros everywhere else.
    """
    unit = np.zeros((10, 1))
    unit[j] = 1.0
    return unit
# Load the three MNIST splits in the format the network code expects.
training_data, validation_data, test_data = load_data_wrapper()

# Three-layer network: 784 inputs, 30 hidden neurons, 10 outputs.
learner = NN([784, 30, 10])
