@mdvsh
Last active April 12, 2020 07:04

Notes and Implementation of Back-propagation Algorithm

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notes and Implementation of Backpropagation Algorithm\n",
"### A third attempt (finally a successful one) to understand the mechanics behind a neural network FT. Calculus\n",
"\n",
"---\n",
"\n",
"## Explaination and Math"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1,3 | 2,4\n",
":-------------------------:|:-------------------------:\n",
"![](https://i.imgur.com/2y312KC.jpg) | ![](https://i.imgur.com/F6Afhfy.jpg)\n",
"![](https://i.imgur.com/OamO0UN.jpg) | ![](https://i.imgur.com/zlAyNq4.jpg)\n",
"\n",
"\n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Notation used\n",
"\n",
"\n",
"**Weights** | **Biases (star)**\n",
":-------------------------:|:-------------------------:\n",
"![](http://neuralnetworksanddeeplearning.com/images/tikz16.png) | ![](http://neuralnetworksanddeeplearning.com/images/tikz17.png)\n",
"\n",
"#### *read notes to better understand*\n",
"\n",
"### Formulas to be further used in code : \n",
"\n",
"![](http://neuralnetworksanddeeplearning.com/images/tikz21.png)\n",
"\n",
"---\n",
"\n",
"### Code"
]
},
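{
"cell_type": "markdown",
"metadata": {},
"source": [
"The four relations shown in the image above, written out in Nielsen's notation so they can be matched line by line against the code (a transcription for reference, not additional math):\n",
"\n",
"$$\\delta^L = \\nabla_a C \\odot \\sigma'(z^L)$$\n",
"\n",
"$$\\delta^l = ((w^{l+1})^T \\delta^{l+1}) \\odot \\sigma'(z^l)$$\n",
"\n",
"$$\\frac{\\partial C}{\\partial b^l_j} = \\delta^l_j$$\n",
"\n",
"$$\\frac{\\partial C}{\\partial w^l_{jk}} = a^{l-1}_k \\delta^l_j$$"
]
},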
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import random\n",
"import matplotlib.pyplot as plt\n",
"import json\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# helper functions (here, activation functions)\n",
"def sigmoid(z):\n",
" \"\"\"the sigmoid activation function\"\"\"\n",
" return 1/(1+np.exp(-1*x))\n",
"\n",
"def sigmoid_prime(x):\n",
" \"\"\"the derivative of the sigmoid function\"\"\"\n",
" return sigmoid(z)*(1-sigmoid(z))\n",
"\n",
"def relu(z):\n",
" \"\"\"the ReLU activation function\"\"\"\n",
" return np.maximum(0,z)\n",
"\n",
"def one_hot_encoding(j):\n",
" \"\"\"One hot encode to a 10-dimensional unit vector with prediction\"\"\"\n",
" encoded_vec = np.zeroes((10, 1))\n",
" encoded_vec[j] = 1.0\n",
" return ohe"
]
},
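{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# quick sanity check of the helpers above (an added illustration, not part of the\n",
"# original flow; it only assumes the corrected definitions in the previous cell)\n",
"print(sigmoid(0.0))                    # expect 0.5\n",
"print(sigmoid_prime(0.0))              # expect 0.25\n",
"print(relu(np.array([-2.0, 3.0])))     # expect [0. 3.]\n",
"print(one_hot_encoding(3).T)           # 1.0 in the 4th position, zeros elsewhere"
]
},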
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# loss functions : here I'll try using both CrossEntropyLoss (losLoss) and \n",
"# QuadraticLoss functions to compare their performance.\n",
"# https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html : Formulas\n",
"# employed decent OOP practices\n",
"\n",
"class CrossEntropyCost(object):\n",
" \n",
" @staticmethod\n",
" def func(a, y):\n",
" \"\"\"\n",
" return : cost associated with input a and desired output y\n",
" sometime, when a = y, the formula for CrossEntropy returns NaN\n",
" Formula : (1-y)*np.log(1-a) \n",
" hence, np.nan_to_num is used to convert NaN's to (0.0)\n",
" \"\"\"\n",
" return np.sum(np.nan_to_numn(-y * np.log(a) - (1-y) * np.log(1-a)))\n",
" \n",
" @staticmethod\n",
" def delta(a, y, z):\n",
" return (a - y)\n",
" \n",
"class MSE_cost(object):\n",
" \n",
" @staticmethod\n",
" def func(a, y):\n",
" \"\"\"return : cost associated with input a and desired output y\"\"\"\n",
" return np.linalg.norm(a - y) * 0.5 ** 2\n",
"\n",
" @staticmethod\n",
" def delta(a, y, z):\n",
" \"\"\"params a, y follow suite\n",
" z is the value of the neuron. from our derivation\n",
" \"\"\"\n",
" return (a - y) * sigmoid_prime(z)"
]
},
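{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toy comparison of the two cost classes above (added example; a_demo / y_demo\n",
"# are made-up vectors, just to show that both costs are called the same way)\n",
"a_demo = np.array([[0.8], [0.1], [0.1]])\n",
"y_demo = np.array([[1.0], [0.0], [0.0]])\n",
"print('cross-entropy cost :', CrossEntropyCost.func(a_demo, y_demo))\n",
"print('quadratic cost     :', MSE_cost.func(a_demo, y_demo))"
]
},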
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"class NN(object):\n",
" \n",
" \n",
" def __init__(self, size, cost = CrossEntropyCost):\n",
" \"\"\"\n",
" list::size : number of neurons in respective layers of the network\n",
" weights and biases are generated randomly through Gaussian Distribution with zero mean and variance of 1.\n",
" \"\"\"\n",
" self.n_layers = len(size)\n",
" self.size = size\n",
" # initializing weights only for 2nd to last layer since 1st layer is input layer (lacks weights)\n",
" self.biases = [np.random.randn(y, 1) for y in self.size[1:]]\n",
" self.weights = [np.random.randn(y, x)/np.sqrt(x)\n",
" for x, y in zip(self.size[:-1], self.size[1:])]\n",
" self.cost = cost\n",
" \n",
" \n",
" def forward_propagation(self, a):\n",
" \"\"\"The neuron calculation formula : Wa+b\"\"\"\n",
" for w, b in zip(self.weights, self.biases):\n",
" a = sigmoid(np.dot(w, a) + b)\n",
" return a\n",
"\n",
" \n",
" def back_propagation(self, x, y):\n",
" \"\"\"\n",
" return : (del_w, del_b) , the gradient for the cost function\n",
" del_w and del_b are layer-by-layer lists of numpy arrays.\n",
" Warning : negative indices would be heavily utilized\n",
" \"\"\"\n",
" del_w = [np.zeros(w.shape) for w in self.weights]\n",
" del_b = [np.zeroes(b.shape) for b in self.biases]\n",
" # forward prop\n",
" curr_activation = x\n",
" activations = [x] # store all activations by layer, remember chain-rule \n",
" z_lis = [] # store all z values for layers, remember tree structure from notes\n",
" for w, b in zip(self.weights, self.biases):\n",
" z = np.dot(w, curr_activation) + b\n",
" curr_activation = sigmoid(z)\n",
" z_lis.append(z)\n",
" activations.append(curr_activation)\n",
" # backward pass : calculating cost by taking last elems of a, z lists\n",
" delta = (self.cost).delta(activations[-1], y, z_lis[-1])\n",
" del_w[-1] = np.dot(delta, activations[-2].transpose())\n",
" del_b[-1] = delta\n",
" \n",
" # going back all layers\n",
" for l in range(2, self.n_layers):\n",
" z = z_lis[-l]\n",
" del_sigmoid = sigmoid_prime(z)\n",
" delta = np.dot(self.weights[-l+1].transpose(), delta) * del_sigmoid\n",
" del_w[-l] = np.dot(delta, activations[-l-1].transpose())\n",
" del_b[-l] = delta\n",
" return (del_w, del_b)\n",
" \n",
" \n",
" def initialize_weight(self):\n",
" \"\"\"\n",
" initialize weights using Gaussian Distribution with mean 0 and SD 1\n",
" over sqrt of number of weights connecting same neuron\n",
" initialize biases using Gaussian Distribution with mean 0 and SD 1\n",
" \"\"\"\n",
" self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in\n",
" zip(self.size[:-1], self.size[:-1])]\n",
" self.biases = [np.random.randn(y, 1) for y in self.size[1:]]\n",
" \n",
" \n",
" def trainer(self, training_data, epochs, m_bs, eta, lmbda, eval_data=None, \n",
" print_eval_cost=False, print_eval_acc=False, print_train_cost=False, \n",
" print_train_acc=False):\n",
" \"\"\"\n",
" Train the neural network using mini-batch stochastic gradient\n",
" descent. \n",
" \"\"\"\n",
" if eval_data:\n",
" n_data = sum(1 for _ in eval_data)\n",
" n = sum(1 for _ in training_data)\n",
" eval_cost, eval_acc = [], []\n",
" train_cost, train_acc = [], []\n",
" for c in range(epochs):\n",
" random.shuffle(training_data)\n",
" mini_batches = [training_data[k:k+m_bs] for k in\n",
" range(0, n, m_bs)]\n",
" for mini_batch in mini_batches:\n",
" self.update_mini_batch(mini_batch, eta, lmbda, len(training_data))\n",
" print('Training : Epoch % complete.' % c)\n",
" \n",
" if print_train_cost:\n",
" acc = self.accuracy(training_data, convert=True)\n",
" train_acc.append(acc)\n",
" print('Accuracy on training data : {} / {}'.format(acc, n))\n",
" if print_train_cost:\n",
" cost = self.total_cost(training_data, lmbda)\n",
" train_cost.append(cost)\n",
" print('Cost on training data : {}'.format(cost))\n",
" if print_eval_acc:\n",
" acc = self.accuracy(eval_data)\n",
" eval_acc.append(acc)\n",
" print('Accuracy on training data : {} / {}'.format(acc, n))\n",
" if print_eval_cost:\n",
" cost = self.total_cost(eval_data, lmbda, convert=True)\n",
" train_cost.append(cost)\n",
" print('Cost on training data : {}'.format(cost))\n",
" print()\n",
" return (eval_cost, eval_acc, train_cost, train_acc)\n",
" \n",
" def update_mini_batch(self, mini_batch, eta, lmbda, n):\n",
" \"\"\"Update the network's weights and biases by applying gradient\n",
" descent using backpropagation to a single mini batch.mini_batch is a list of tuples ``(x, y)``, ``eta`` is the\n",
" learning rate, lmbda is the regularization parameter, and\n",
" n is the total size of the training data set.\n",
" \"\"\"\n",
" nabla_b = [np.zeros(b.shape) for b in self.biases]\n",
" nabla_w = [np.zeros(w.shape) for w in self.weights]\n",
" for x, y in mini_batch:\n",
" delta_nabla_b, delta_nabla_w = self.backprop(x, y)\n",
" nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]\n",
" nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]\n",
" self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw\n",
" for w, nw in zip(self.weights, nabla_w)]\n",
" self.biases = [b-(eta/len(mini_batch))*nb\n",
" for b, nb in zip(self.biases, nabla_b)]\n",
" \n",
" \n",
" def accuracy(self, data, convert=False):\n",
" \"\"\"Return the number of inputs in ``data`` for which the neural\n",
" network outputs the correct result. The neural network's\n",
" output is assumed to be the index of whichever neuron in the\n",
" final layer has the highest activation.\n",
" The flag ``convert`` should be set to False if the data set is\n",
" validation or test data (the usual case), and to True if the\n",
" data set is the training data. \n",
" \"\"\"\n",
" if convert:\n",
" results = [(np.argmax(self.feedforward(x)), np.argmax(y))\n",
" for (x, y) in data]\n",
" else:\n",
" results = [(np.argmax(self.feedforward(x)), y)\n",
" for (x, y) in data]\n",
" return sum(int(x == y) for (x, y) in results)\n",
"\n",
" \n",
" def total_cost(self, data, lmbda, convert=False):\n",
" \"\"\"Return the total cost for the data set ``data``. \n",
" \"\"\"\n",
" cost = 0.0\n",
" for x, y in data:\n",
" a = self.feedforward(x)\n",
" if convert: y = vectorized_result(y)\n",
" cost += self.cost.fn(a, y)/len(data)\n",
" cost += 0.5*(lmbda/len(data))*sum(\n",
" np.linalg.norm(w)**2 for w in self.weights)\n",
" return cost\n",
"\n",
" \n",
" def save(self, filename):\n",
" \"\"\"Save the neural network to the file ``filename``.\"\"\"\n",
" data = {\"size\": self.size,\n",
" \"weights\": [w.tolist() for w in self.weights],\n",
" \"biases\": [b.tolist() for b in self.biases],\n",
" \"cost\": str(self.cost.__name__)}\n",
" f = open(filename, \"w\")\n",
" json.dump(data, f)\n",
" f.close()"
]
},
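{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# minimal smoke test of the NN class (added sketch; the 784-30-10 shape simply\n",
"# mirrors the MNIST network built further below, and the input is random noise)\n",
"demo_net = NN([784, 30, 10])\n",
"demo_out = demo_net.forward_propagation(np.random.randn(784, 1))\n",
"print(demo_out.shape)        # (10, 1) : one activation per output neuron\n",
"print(np.argmax(demo_out))   # the 'digit' the untrained network would predict"
]
},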
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Data"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# this python code to load MNIST Data is by Michael Nielsen.\n",
"import _pickle as cPickle\n",
"import gzip\n",
"\n",
"# Third-party libraries\n",
"import numpy as np\n",
"\n",
"def load_data():\n",
" \"\"\"Return the MNIST data as a tuple containing the training data,\n",
" the validation data, and the test data.\n",
" The ``training_data`` is returned as a tuple with two entries.\n",
" The first entry contains the actual training images. This is a\n",
" numpy ndarray with 50,000 entries. Each entry is, in turn, a\n",
" numpy ndarray with 784 values, representing the 28 * 28 = 784\n",
" pixels in a single MNIST image.\n",
" The second entry in the ``training_data`` tuple is a numpy ndarray\n",
" containing 50,000 entries. Those entries are just the digit\n",
" values (0...9) for the corresponding images contained in the first\n",
" entry of the tuple.\n",
" The ``validation_data`` and ``test_data`` are similar, except\n",
" each contains only 10,000 images.\n",
" This is a nice data format, but for use in neural networks it's\n",
" helpful to modify the format of the ``training_data`` a little.\n",
" That's done in the wrapper function ``load_data_wrapper()``, see\n",
" below.\n",
" \"\"\"\n",
" f = gzip.open('mnist.pkl.gz', 'rb')\n",
" training_data, validation_data, test_data = cPickle.load(f, encoding='latin1')\n",
" f.close()\n",
" return (training_data, validation_data, test_data)\n",
"\n",
"def load_data_wrapper():\n",
" \"\"\"Return a tuple containing ``(training_data, validation_data,\n",
" test_data)``. Based on ``load_data``, but the format is more\n",
" convenient for use in our implementation of neural networks.\n",
" In particular, ``training_data`` is a list containing 50,000\n",
" 2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray\n",
" containing the input image. ``y`` is a 10-dimensional\n",
" numpy.ndarray representing the unit vector corresponding to the\n",
" correct digit for ``x``.\n",
" ``validation_data`` and ``test_data`` are lists containing 10,000\n",
" 2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional\n",
" numpy.ndarry containing the input image, and ``y`` is the\n",
" corresponding classification, i.e., the digit values (integers)\n",
" corresponding to ``x``.\n",
" Obviously, this means we're using slightly different formats for\n",
" the training data and the validation / test data. These formats\n",
" turn out to be the most convenient for use in our neural network\n",
" code.\"\"\"\n",
" tr_d, va_d, te_d = load_data()\n",
" training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]\n",
" training_results = [vectorized_result(y) for y in tr_d[1]]\n",
" training_data = zip(training_inputs, training_results)\n",
" validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]\n",
" validation_data = zip(validation_inputs, va_d[1])\n",
" test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]\n",
" test_data = zip(test_inputs, te_d[1])\n",
" return (training_data, validation_data, test_data)\n",
"\n",
"def vectorized_result(j):\n",
" \"\"\"Return a 10-dimensional unit vector with a 1.0 in the jth\n",
" position and zeroes elsewhere. This is used to convert a digit\n",
" (0...9) into a corresponding desired output from the neural\n",
" network.\"\"\"\n",
" e = np.zeros((10, 1))\n",
" e[j] = 1.0\n",
" return e"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"training_data, validation_data, test_data = load_data_wrapper()"
]
},
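{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# peek at a single training example to confirm the shapes described in the\n",
"# docstrings above (added check; relies on load_data_wrapper returning lists)\n",
"x0, y0 = training_data[0]\n",
"print(x0.shape, y0.shape)    # (784, 1) (10, 1)\n",
"print('label :', np.argmax(y0))"
]
},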
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"learner = NN([784, 30, 10])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "object of type 'zip' has no len()",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-44-742e24694131>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlearner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3.0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0.1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-40-6f3c6ca52295>\u001b[0m in \u001b[0;36mtrainer\u001b[0;34m(self, training_data, epochs, m_bs, eta, lmbda, eval_data, print_eval_cost, print_eval_acc, print_train_cost, print_train_acc)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0mtrain_cost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_acc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepochs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m \u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 82\u001b[0m mini_batches = [training_data[k:k+m_bs] for k in\n\u001b[1;32m 83\u001b[0m range(0, n, m_bs)]\n",
"\u001b[0;32m~/anaconda3/envs/fastai/lib/python3.7/random.py\u001b[0m in \u001b[0;36mshuffle\u001b[0;34m(self, x, random)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrandom\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0mrandbelow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_randbelow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mreversed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 276\u001b[0m \u001b[0;31m# pick an element in x[:i+1] with which to exchange x[i]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrandbelow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: object of type 'zip' has no len()"
]
}
],
"source": [
"learner.trainer(training_data, 30, 10, 3.0, 0.1, eval_data=test_data)"
]
},
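{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# after training, the network can be evaluated and serialized with the methods\n",
"# defined above (illustrative calls; 'backprop_net.json' is just a placeholder filename)\n",
"print('accuracy on test data :', learner.accuracy(test_data), '/', len(test_data))\n",
"learner.save('backprop_net.json')"
]
}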
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@mdvsh (Author) commented Apr 12, 2020

Other resources to dive deeper into Backpropagation Calculus
