mdvsh/backprop-notes-and-implementation.ipynb

## backprop-notes-and-implementation.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notes and Implementation of Backpropagation Algorithm\n",
    "### A third attempt (finally a successful one) to understand the mechanics behind a neural network FT. Calculus\n",
    "\n",
    "---\n",
    "\n",
    "## Explaination and Math"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1,3             |  2,4\n",
    ":-------------------------:|:-------------------------:\n",
    "![](https://i.imgur.com/2y312KC.jpg) |  ![](https://i.imgur.com/F6Afhfy.jpg)\n",
    "![](https://i.imgur.com/OamO0UN.jpg) |  ![](https://i.imgur.com/zlAyNq4.jpg)\n",
    "\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Notation used\n",
    "\n",
    "\n",
    "**Weights**            |  **Biases (star)**\n",
    ":-------------------------:|:-------------------------:\n",
    "![](http://neuralnetworksanddeeplearning.com/images/tikz16.png) |  ![](http://neuralnetworksanddeeplearning.com/images/tikz17.png)\n",
    "\n",
    "#### *read notes to better understand*\n",
    "\n",
    "### Formulas to be further used in code : \n",
    "\n",
    "![](http://neuralnetworksanddeeplearning.com/images/tikz21.png)\n",
    "\n",
    "---\n",
    "\n",
    "### Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import random\n",
    "import matplotlib.pyplot as plt\n",
    "import json\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# helper functions (here, activation functions)\n",
    "def sigmoid(z):\n",
    "    \"\"\"the sigmoid activation function\"\"\"\n",
    "    return 1/(1+np.exp(-1*x))\n",
    "\n",
    "def sigmoid_prime(x):\n",
    "    \"\"\"the derivative of the sigmoid function\"\"\"\n",
    "    return sigmoid(z)*(1-sigmoid(z))\n",
    "\n",
    "def relu(z):\n",
    "    \"\"\"the ReLU activation function\"\"\"\n",
    "    return np.maximum(0,z)\n",
    "\n",
    "def one_hot_encoding(j):\n",
    "    \"\"\"One hot encode to a 10-dimensional unit vector with prediction\"\"\"\n",
    "    encoded_vec = np.zeroes((10, 1))\n",
    "    encoded_vec[j] = 1.0\n",
    "    return ohe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loss functions : here I'll try using both CrossEntropyLoss (losLoss) and \n",
    "# QuadraticLoss functions to compare their performance.\n",
    "# https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html : Formulas\n",
    "# employed decent OOP practices\n",
    "\n",
    "class CrossEntropyCost(object):\n",
    "    \n",
    "    @staticmethod\n",
    "    def func(a, y):\n",
    "        \"\"\"\n",
    "        return : cost associated with input a and desired output y\n",
    "        sometime, when a = y, the formula for CrossEntropy returns NaN\n",
    "        Formula : (1-y)*np.log(1-a) \n",
    "        hence, np.nan_to_num is used to convert NaN's to (0.0)\n",
    "        \"\"\"\n",
    "        return np.sum(np.nan_to_numn(-y * np.log(a) - (1-y) * np.log(1-a)))\n",
    "    \n",
    "    @staticmethod\n",
    "    def delta(a, y, z):\n",
    "        return (a - y)\n",
    "    \n",
    "class MSE_cost(object):\n",
    "    \n",
    "    @staticmethod\n",
    "    def func(a, y):\n",
    "        \"\"\"return : cost associated with input a and desired output y\"\"\"\n",
    "        return np.linalg.norm(a - y) * 0.5 ** 2\n",
    "\n",
    "    @staticmethod\n",
    "    def delta(a, y, z):\n",
    "        \"\"\"params a, y follow suite\n",
    "            z is the value of the neuron. from our derivation\n",
    "        \"\"\"\n",
    "        return (a - y) * sigmoid_prime(z)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NN(object):\n",
    "    \n",
    "    \n",
    "    def __init__(self, size, cost = CrossEntropyCost):\n",
    "        \"\"\"\n",
    "        list::size : number of neurons in respective layers of the network\n",
    "        weights and biases are generated randomly through Gaussian Distribution with zero mean and variance of 1.\n",
    "        \"\"\"\n",
    "        self.n_layers = len(size)\n",
    "        self.size = size\n",
    "        # initializing weights only for 2nd to last layer since 1st layer is input layer (lacks weights)\n",
    "        self.biases = [np.random.randn(y, 1) for y in self.size[1:]]\n",
    "        self.weights = [np.random.randn(y, x)/np.sqrt(x)\n",
    "                        for x, y in zip(self.size[:-1], self.size[1:])]\n",
    "        self.cost = cost\n",
    "        \n",
    "        \n",
    "    def forward_propagation(self, a):\n",
    "        \"\"\"The neuron calculation formula : Wa+b\"\"\"\n",
    "        for w, b in zip(self.weights, self.biases):\n",
    "            a = sigmoid(np.dot(w, a) + b)\n",
    "        return a\n",
    "\n",
    "    \n",
    "    def back_propagation(self, x, y):\n",
    "        \"\"\"\n",
    "        return : (del_w, del_b) , the gradient for the cost function\n",
    "        del_w and del_b are layer-by-layer lists of numpy arrays.\n",
    "        Warning : negative indices would be heavily utilized\n",
    "        \"\"\"\n",
    "        del_w = [np.zeros(w.shape) for w in self.weights]\n",
    "        del_b = [np.zeroes(b.shape) for b in self.biases]\n",
    "        # forward prop\n",
    "        curr_activation = x\n",
    "        activations = [x] # store all activations by layer, remember chain-rule \n",
    "        z_lis = [] # store all z values for layers, remember tree structure from notes\n",
    "        for w, b in zip(self.weights, self.biases):\n",
    "            z = np.dot(w, curr_activation) + b\n",
    "            curr_activation = sigmoid(z)\n",
    "            z_lis.append(z)\n",
    "            activations.append(curr_activation)\n",
    "        # backward pass : calculating cost by taking last elems of a, z lists\n",
    "        delta = (self.cost).delta(activations[-1], y, z_lis[-1])\n",
    "        del_w[-1] = np.dot(delta, activations[-2].transpose())\n",
    "        del_b[-1] = delta\n",
    "        \n",
    "        # going back all layers\n",
    "        for l in range(2, self.n_layers):\n",
    "            z = z_lis[-l]\n",
    "            del_sigmoid = sigmoid_prime(z)\n",
    "            delta = np.dot(self.weights[-l+1].transpose(), delta) * del_sigmoid\n",
    "            del_w[-l] = np.dot(delta, activations[-l-1].transpose())\n",
    "            del_b[-l] = delta\n",
    "        return (del_w, del_b)\n",
    "    \n",
    "    \n",
    "    def initialize_weight(self):\n",
    "        \"\"\"\n",
    "        initialize weights using Gaussian Distribution with mean 0 and SD  1\n",
    "        over sqrt of number of weights connecting same neuron\n",
    "        initialize biases using Gaussian Distribution with mean 0 and SD 1\n",
    "        \"\"\"\n",
    "        self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in\n",
    "                        zip(self.size[:-1], self.size[:-1])]\n",
    "        self.biases = [np.random.randn(y, 1) for y in self.size[1:]]\n",
    "        \n",
    "        \n",
    "    def trainer(self, training_data, epochs, m_bs, eta, lmbda, eval_data=None, \n",
    "               print_eval_cost=False, print_eval_acc=False, print_train_cost=False, \n",
    "               print_train_acc=False):\n",
    "        \"\"\"\n",
    "        Train the neural network using mini-batch stochastic gradient\n",
    "        descent. \n",
    "        \"\"\"\n",
    "        if eval_data:\n",
    "            n_data = sum(1 for _ in eval_data)\n",
    "        n = sum(1 for _ in training_data)\n",
    "        eval_cost, eval_acc = [], []\n",
    "        train_cost, train_acc = [], []\n",
    "        for c in range(epochs):\n",
    "            random.shuffle(training_data)\n",
    "            mini_batches = [training_data[k:k+m_bs] for k in\n",
    "                           range(0, n, m_bs)]\n",
    "            for mini_batch in mini_batches:\n",
    "                self.update_mini_batch(mini_batch, eta, lmbda, len(training_data))\n",
    "            print('Training : Epoch % complete.' % c)\n",
    "            \n",
    "            if print_train_cost:\n",
    "                acc = self.accuracy(training_data, convert=True)\n",
    "                train_acc.append(acc)\n",
    "                print('Accuracy on training data : {} / {}'.format(acc, n))\n",
    "            if print_train_cost:\n",
    "                cost = self.total_cost(training_data, lmbda)\n",
    "                train_cost.append(cost)\n",
    "                print('Cost on training data : {}'.format(cost))\n",
    "            if print_eval_acc:\n",
    "                acc = self.accuracy(eval_data)\n",
    "                eval_acc.append(acc)\n",
    "                print('Accuracy on training data : {} / {}'.format(acc, n))\n",
    "            if print_eval_cost:\n",
    "                cost = self.total_cost(eval_data, lmbda, convert=True)\n",
    "                train_cost.append(cost)\n",
    "                print('Cost on training data : {}'.format(cost))\n",
    "            print()\n",
    "        return (eval_cost, eval_acc, train_cost, train_acc)\n",
    "    \n",
    "    def update_mini_batch(self, mini_batch, eta, lmbda, n):\n",
    "        \"\"\"Update the network's weights and biases by applying gradient\n",
    "        descent using backpropagation to a single mini batch.mini_batch is a list of tuples ``(x, y)``, ``eta`` is the\n",
    "        learning rate, lmbda is the regularization parameter, and\n",
    "        n is the total size of the training data set.\n",
    "        \"\"\"\n",
    "        nabla_b = [np.zeros(b.shape) for b in self.biases]\n",
    "        nabla_w = [np.zeros(w.shape) for w in self.weights]\n",
    "        for x, y in mini_batch:\n",
    "            delta_nabla_b, delta_nabla_w = self.backprop(x, y)\n",
    "            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]\n",
    "            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]\n",
    "        self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw\n",
    "                        for w, nw in zip(self.weights, nabla_w)]\n",
    "        self.biases = [b-(eta/len(mini_batch))*nb\n",
    "                       for b, nb in zip(self.biases, nabla_b)]\n",
    "    \n",
    "    \n",
    "    def accuracy(self, data, convert=False):\n",
    "        \"\"\"Return the number of inputs in ``data`` for which the neural\n",
    "        network outputs the correct result. The neural network's\n",
    "        output is assumed to be the index of whichever neuron in the\n",
    "        final layer has the highest activation.\n",
    "        The flag ``convert`` should be set to False if the data set is\n",
    "        validation or test data (the usual case), and to True if the\n",
    "        data set is the training data. \n",
    "        \"\"\"\n",
    "        if convert:\n",
    "            results = [(np.argmax(self.feedforward(x)), np.argmax(y))\n",
    "                       for (x, y) in data]\n",
    "        else:\n",
    "            results = [(np.argmax(self.feedforward(x)), y)\n",
    "                        for (x, y) in data]\n",
    "        return sum(int(x == y) for (x, y) in results)\n",
    "\n",
    "    \n",
    "    def total_cost(self, data, lmbda, convert=False):\n",
    "        \"\"\"Return the total cost for the data set ``data``. \n",
    "        \"\"\"\n",
    "        cost = 0.0\n",
    "        for x, y in data:\n",
    "            a = self.feedforward(x)\n",
    "            if convert: y = vectorized_result(y)\n",
    "            cost += self.cost.fn(a, y)/len(data)\n",
    "        cost += 0.5*(lmbda/len(data))*sum(\n",
    "            np.linalg.norm(w)**2 for w in self.weights)\n",
    "        return cost\n",
    "\n",
    "    \n",
    "    def save(self, filename):\n",
    "        \"\"\"Save the neural network to the file ``filename``.\"\"\"\n",
    "        data = {\"size\": self.size,\n",
    "                \"weights\": [w.tolist() for w in self.weights],\n",
    "                \"biases\": [b.tolist() for b in self.biases],\n",
    "                \"cost\": str(self.cost.__name__)}\n",
    "        f = open(filename, \"w\")\n",
    "        json.dump(data, f)\n",
    "        f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this python code to load MNIST Data is by Michael Nielsen.\n",
    "import _pickle as cPickle\n",
    "import gzip\n",
    "\n",
    "# Third-party libraries\n",
    "import numpy as np\n",
    "\n",
    "def load_data():\n",
    "    \"\"\"Return the MNIST data as a tuple containing the training data,\n",
    "    the validation data, and the test data.\n",
    "    The ``training_data`` is returned as a tuple with two entries.\n",
    "    The first entry contains the actual training images.  This is a\n",
    "    numpy ndarray with 50,000 entries.  Each entry is, in turn, a\n",
    "    numpy ndarray with 784 values, representing the 28 * 28 = 784\n",
    "    pixels in a single MNIST image.\n",
    "    The second entry in the ``training_data`` tuple is a numpy ndarray\n",
    "    containing 50,000 entries.  Those entries are just the digit\n",
    "    values (0...9) for the corresponding images contained in the first\n",
    "    entry of the tuple.\n",
    "    The ``validation_data`` and ``test_data`` are similar, except\n",
    "    each contains only 10,000 images.\n",
    "    This is a nice data format, but for use in neural networks it's\n",
    "    helpful to modify the format of the ``training_data`` a little.\n",
    "    That's done in the wrapper function ``load_data_wrapper()``, see\n",
    "    below.\n",
    "    \"\"\"\n",
    "    f = gzip.open('mnist.pkl.gz', 'rb')\n",
    "    training_data, validation_data, test_data = cPickle.load(f, encoding='latin1')\n",
    "    f.close()\n",
    "    return (training_data, validation_data, test_data)\n",
    "\n",
    "def load_data_wrapper():\n",
    "    \"\"\"Return a tuple containing ``(training_data, validation_data,\n",
    "    test_data)``. Based on ``load_data``, but the format is more\n",
    "    convenient for use in our implementation of neural networks.\n",
    "    In particular, ``training_data`` is a list containing 50,000\n",
    "    2-tuples ``(x, y)``.  ``x`` is a 784-dimensional numpy.ndarray\n",
    "    containing the input image.  ``y`` is a 10-dimensional\n",
    "    numpy.ndarray representing the unit vector corresponding to the\n",
    "    correct digit for ``x``.\n",
    "    ``validation_data`` and ``test_data`` are lists containing 10,000\n",
    "    2-tuples ``(x, y)``.  In each case, ``x`` is a 784-dimensional\n",
    "    numpy.ndarry containing the input image, and ``y`` is the\n",
    "    corresponding classification, i.e., the digit values (integers)\n",
    "    corresponding to ``x``.\n",
    "    Obviously, this means we're using slightly different formats for\n",
    "    the training data and the validation / test data.  These formats\n",
    "    turn out to be the most convenient for use in our neural network\n",
    "    code.\"\"\"\n",
    "    tr_d, va_d, te_d = load_data()\n",
    "    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]\n",
    "    training_results = [vectorized_result(y) for y in tr_d[1]]\n",
    "    training_data = zip(training_inputs, training_results)\n",
    "    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]\n",
    "    validation_data = zip(validation_inputs, va_d[1])\n",
    "    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]\n",
    "    test_data = zip(test_inputs, te_d[1])\n",
    "    return (training_data, validation_data, test_data)\n",
    "\n",
    "def vectorized_result(j):\n",
    "    \"\"\"Return a 10-dimensional unit vector with a 1.0 in the jth\n",
    "    position and zeroes elsewhere.  This is used to convert a digit\n",
    "    (0...9) into a corresponding desired output from the neural\n",
    "    network.\"\"\"\n",
    "    e = np.zeros((10, 1))\n",
    "    e[j] = 1.0\n",
    "    return e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_data, validation_data, test_data = load_data_wrapper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "learner = NN([784, 30, 10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "object of type 'zip' has no len()",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-44-742e24694131>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlearner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3.0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0.1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-40-6f3c6ca52295>\u001b[0m in \u001b[0;36mtrainer\u001b[0;34m(self, training_data, epochs, m_bs, eta, lmbda, eval_data, print_eval_cost, print_eval_acc, print_train_cost, print_train_acc)\u001b[0m\n\u001b[1;32m     79\u001b[0m         \u001b[0mtrain_cost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_acc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     80\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepochs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m             \u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     82\u001b[0m             mini_batches = [training_data[k:k+m_bs] for k in\n\u001b[1;32m     83\u001b[0m                            range(0, n, m_bs)]\n",
      "\u001b[0;32m~/anaconda3/envs/fastai/lib/python3.7/random.py\u001b[0m in \u001b[0;36mshuffle\u001b[0;34m(self, x, random)\u001b[0m\n\u001b[1;32m    273\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mrandom\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    274\u001b[0m             \u001b[0mrandbelow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_randbelow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m             \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mreversed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    276\u001b[0m                 \u001b[0;31m# pick an element in x[:i+1] with which to exchange x[i]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    277\u001b[0m                 \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrandbelow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: object of type 'zip' has no len()"
     ]
    }
   ],
   "source": [
    "learner.trainer(training_data, 30, 10, 3.0, 0.1, eval_data=test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "|"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Notes and Implementation of Backpropagation Algorithm\n",
	"### A third attempt (finally a successful one) to understand the mechanics behind a neural network FT. Calculus\n",
	"\n",
	"---\n",
	"\n",
	"## Explaination and Math"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"1,3 \| 2,4\n",
	":-------------------------:\|:-------------------------:\n",
	"![](https://i.imgur.com/2y312KC.jpg) \| ![](https://i.imgur.com/F6Afhfy.jpg)\n",
	"![](https://i.imgur.com/OamO0UN.jpg) \| ![](https://i.imgur.com/zlAyNq4.jpg)\n",
	"\n",
	"\n",
	"---"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Notation used\n",
	"\n",
	"\n",
	"Weights \| Biases (star)\n",
	":-------------------------:\|:-------------------------:\n",
	"![](http://neuralnetworksanddeeplearning.com/images/tikz16.png) \| ![](http://neuralnetworksanddeeplearning.com/images/tikz17.png)\n",
	"\n",
	"#### read notes to better understand\n",
	"\n",
	"### Formulas to be further used in code : \n",
	"\n",
	"![](http://neuralnetworksanddeeplearning.com/images/tikz21.png)\n",
	"\n",
	"---\n",
	"\n",
	"### Code"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"%matplotlib inline\n",
	"import numpy as np\n",
	"import random\n",
	"import matplotlib.pyplot as plt\n",
	"import json\n",
	"import sys"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# helper functions (here, activation functions)\n",
	"def sigmoid(z):\n",
	" \"\"\"the sigmoid activation function\"\"\"\n",
	" return 1/(1+np.exp(-1*x))\n",
	"\n",
	"def sigmoid_prime(x):\n",
	" \"\"\"the derivative of the sigmoid function\"\"\"\n",
	" return sigmoid(z)*(1-sigmoid(z))\n",
	"\n",
	"def relu(z):\n",
	" \"\"\"the ReLU activation function\"\"\"\n",
	" return np.maximum(0,z)\n",
	"\n",
	"def one_hot_encoding(j):\n",
	" \"\"\"One hot encode to a 10-dimensional unit vector with prediction\"\"\"\n",
	" encoded_vec = np.zeroes((10, 1))\n",
	" encoded_vec[j] = 1.0\n",
	" return ohe"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# loss functions : here I'll try using both CrossEntropyLoss (losLoss) and \n",
	"# QuadraticLoss functions to compare their performance.\n",
	"# https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html : Formulas\n",
	"# employed decent OOP practices\n",
	"\n",
	"class CrossEntropyCost(object):\n",
	" \n",
	" @staticmethod\n",
	" def func(a, y):\n",
	" \"\"\"\n",
	" return : cost associated with input a and desired output y\n",
	" sometime, when a = y, the formula for CrossEntropy returns NaN\n",
	" Formula : (1-y)*np.log(1-a) \n",
	" hence, np.nan_to_num is used to convert NaN's to (0.0)\n",
	" \"\"\"\n",
	" return np.sum(np.nan_to_numn(-y * np.log(a) - (1-y) * np.log(1-a)))\n",
	" \n",
	" @staticmethod\n",
	" def delta(a, y, z):\n",
	" return (a - y)\n",
	" \n",
	"class MSE_cost(object):\n",
	" \n",
	" @staticmethod\n",
	" def func(a, y):\n",
	" \"\"\"return : cost associated with input a and desired output y\"\"\"\n",
	" return np.linalg.norm(a - y) * 0.5 ** 2\n",
	"\n",
	" @staticmethod\n",
	" def delta(a, y, z):\n",
	" \"\"\"params a, y follow suite\n",
	" z is the value of the neuron. from our derivation\n",
	" \"\"\"\n",
	" return (a - y) * sigmoid_prime(z)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [],
	"source": [
	"class NN(object):\n",
	" \n",
	" \n",
	" def __init__(self, size, cost = CrossEntropyCost):\n",
	" \"\"\"\n",
	" list::size : number of neurons in respective layers of the network\n",
	" weights and biases are generated randomly through Gaussian Distribution with zero mean and variance of 1.\n",
	" \"\"\"\n",
	" self.n_layers = len(size)\n",
	" self.size = size\n",
	" # initializing weights only for 2nd to last layer since 1st layer is input layer (lacks weights)\n",
	" self.biases = [np.random.randn(y, 1) for y in self.size[1:]]\n",
	" self.weights = [np.random.randn(y, x)/np.sqrt(x)\n",
	" for x, y in zip(self.size[:-1], self.size[1:])]\n",
	" self.cost = cost\n",
	" \n",
	" \n",
	" def forward_propagation(self, a):\n",
	" \"\"\"The neuron calculation formula : Wa+b\"\"\"\n",
	" for w, b in zip(self.weights, self.biases):\n",
	" a = sigmoid(np.dot(w, a) + b)\n",
	" return a\n",
	"\n",
	" \n",
	" def back_propagation(self, x, y):\n",
	" \"\"\"\n",
	" return : (del_w, del_b) , the gradient for the cost function\n",
	" del_w and del_b are layer-by-layer lists of numpy arrays.\n",
	" Warning : negative indices would be heavily utilized\n",
	" \"\"\"\n",
	" del_w = [np.zeros(w.shape) for w in self.weights]\n",
	" del_b = [np.zeroes(b.shape) for b in self.biases]\n",
	" # forward prop\n",
	" curr_activation = x\n",
	" activations = [x] # store all activations by layer, remember chain-rule \n",
	" z_lis = [] # store all z values for layers, remember tree structure from notes\n",
	" for w, b in zip(self.weights, self.biases):\n",
	" z = np.dot(w, curr_activation) + b\n",
	" curr_activation = sigmoid(z)\n",
	" z_lis.append(z)\n",
	" activations.append(curr_activation)\n",
	" # backward pass : calculating cost by taking last elems of a, z lists\n",
	" delta = (self.cost).delta(activations[-1], y, z_lis[-1])\n",
	" del_w[-1] = np.dot(delta, activations[-2].transpose())\n",
	" del_b[-1] = delta\n",
	" \n",
	" # going back all layers\n",
	" for l in range(2, self.n_layers):\n",
	" z = z_lis[-l]\n",
	" del_sigmoid = sigmoid_prime(z)\n",
	" delta = np.dot(self.weights[-l+1].transpose(), delta) * del_sigmoid\n",
	" del_w[-l] = np.dot(delta, activations[-l-1].transpose())\n",
	" del_b[-l] = delta\n",
	" return (del_w, del_b)\n",
	" \n",
	" \n",
	" def initialize_weight(self):\n",
	" \"\"\"\n",
	" initialize weights using Gaussian Distribution with mean 0 and SD 1\n",
	" over sqrt of number of weights connecting same neuron\n",
	" initialize biases using Gaussian Distribution with mean 0 and SD 1\n",
	" \"\"\"\n",
	" self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in\n",
	" zip(self.size[:-1], self.size[:-1])]\n",
	" self.biases = [np.random.randn(y, 1) for y in self.size[1:]]\n",
	" \n",
	" \n",
	" def trainer(self, training_data, epochs, m_bs, eta, lmbda, eval_data=None, \n",
	" print_eval_cost=False, print_eval_acc=False, print_train_cost=False, \n",
	" print_train_acc=False):\n",
	" \"\"\"\n",
	" Train the neural network using mini-batch stochastic gradient\n",
	" descent. \n",
	" \"\"\"\n",
	" if eval_data:\n",
	" n_data = sum(1 for _ in eval_data)\n",
	" n = sum(1 for _ in training_data)\n",
	" eval_cost, eval_acc = [], []\n",
	" train_cost, train_acc = [], []\n",
	" for c in range(epochs):\n",
	" random.shuffle(training_data)\n",
	" mini_batches = [training_data[k:k+m_bs] for k in\n",
	" range(0, n, m_bs)]\n",
	" for mini_batch in mini_batches:\n",
	" self.update_mini_batch(mini_batch, eta, lmbda, len(training_data))\n",
	" print('Training : Epoch % complete.' % c)\n",
	" \n",
	" if print_train_cost:\n",
	" acc = self.accuracy(training_data, convert=True)\n",
	" train_acc.append(acc)\n",
	" print('Accuracy on training data : {} / {}'.format(acc, n))\n",
	" if print_train_cost:\n",
	" cost = self.total_cost(training_data, lmbda)\n",
	" train_cost.append(cost)\n",
	" print('Cost on training data : {}'.format(cost))\n",
	" if print_eval_acc:\n",
	" acc = self.accuracy(eval_data)\n",
	" eval_acc.append(acc)\n",
	" print('Accuracy on training data : {} / {}'.format(acc, n))\n",
	" if print_eval_cost:\n",
	" cost = self.total_cost(eval_data, lmbda, convert=True)\n",
	" train_cost.append(cost)\n",
	" print('Cost on training data : {}'.format(cost))\n",
	" print()\n",
	" return (eval_cost, eval_acc, train_cost, train_acc)\n",
	" \n",
	" def update_mini_batch(self, mini_batch, eta, lmbda, n):\n",
	" \"\"\"Update the network's weights and biases by applying gradient\n",
	" descent using backpropagation to a single mini batch.mini_batch is a list of tuples ``(x, y)``, ``eta`` is the\n",
	" learning rate, lmbda is the regularization parameter, and\n",
	" n is the total size of the training data set.\n",
	" \"\"\"\n",
	" nabla_b = [np.zeros(b.shape) for b in self.biases]\n",
	" nabla_w = [np.zeros(w.shape) for w in self.weights]\n",
	" for x, y in mini_batch:\n",
	" delta_nabla_b, delta_nabla_w = self.backprop(x, y)\n",
	" nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]\n",
	" nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]\n",
	" self.weights = [(1-eta(lmbda/n))w-(eta/len(mini_batch))*nw\n",
	" for w, nw in zip(self.weights, nabla_w)]\n",
	" self.biases = [b-(eta/len(mini_batch))*nb\n",
	" for b, nb in zip(self.biases, nabla_b)]\n",
	" \n",
	" \n",
	" def accuracy(self, data, convert=False):\n",
	" \"\"\"Return the number of inputs in ``data`` for which the neural\n",
	" network outputs the correct result. The neural network's\n",
	" output is assumed to be the index of whichever neuron in the\n",
	" final layer has the highest activation.\n",
	" The flag ``convert`` should be set to False if the data set is\n",
	" validation or test data (the usual case), and to True if the\n",
	" data set is the training data. \n",
	" \"\"\"\n",
	" if convert:\n",
	" results = [(np.argmax(self.feedforward(x)), np.argmax(y))\n",
	" for (x, y) in data]\n",
	" else:\n",
	" results = [(np.argmax(self.feedforward(x)), y)\n",
	" for (x, y) in data]\n",
	" return sum(int(x == y) for (x, y) in results)\n",
	"\n",
	" \n",
	" def total_cost(self, data, lmbda, convert=False):\n",
	" \"\"\"Return the total cost for the data set ``data``. \n",
	" \"\"\"\n",
	" cost = 0.0\n",
	" for x, y in data:\n",
	" a = self.feedforward(x)\n",
	" if convert: y = vectorized_result(y)\n",
	" cost += self.cost.fn(a, y)/len(data)\n",
	" cost += 0.5(lmbda/len(data))sum(\n",
	" np.linalg.norm(w)**2 for w in self.weights)\n",
	" return cost\n",
	"\n",
	" \n",
	" def save(self, filename):\n",
	" \"\"\"Save the neural network to the file ``filename``.\"\"\"\n",
	" data = {\"size\": self.size,\n",
	" \"weights\": [w.tolist() for w in self.weights],\n",
	" \"biases\": [b.tolist() for b in self.biases],\n",
	" \"cost\": str(self.cost.__name__)}\n",
	" f = open(filename, \"w\")\n",
	" json.dump(data, f)\n",
	" f.close()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Loading Data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {},
	"outputs": [],
	"source": [
	"# this python code to load MNIST Data is by Michael Nielsen.\n",
	"import _pickle as cPickle\n",
	"import gzip\n",
	"\n",
	"# Third-party libraries\n",
	"import numpy as np\n",
	"\n",
	"def load_data():\n",
	" \"\"\"Return the MNIST data as a tuple containing the training data,\n",
	" the validation data, and the test data.\n",
	" The ``training_data`` is returned as a tuple with two entries.\n",
	" The first entry contains the actual training images. This is a\n",
	" numpy ndarray with 50,000 entries. Each entry is, in turn, a\n",
	" numpy ndarray with 784 values, representing the 28 * 28 = 784\n",
	" pixels in a single MNIST image.\n",
	" The second entry in the ``training_data`` tuple is a numpy ndarray\n",
	" containing 50,000 entries. Those entries are just the digit\n",
	" values (0...9) for the corresponding images contained in the first\n",
	" entry of the tuple.\n",
	" The ``validation_data`` and ``test_data`` are similar, except\n",
	" each contains only 10,000 images.\n",
	" This is a nice data format, but for use in neural networks it's\n",
	" helpful to modify the format of the ``training_data`` a little.\n",
	" That's done in the wrapper function ``load_data_wrapper()``, see\n",
	" below.\n",
	" \"\"\"\n",
	" f = gzip.open('mnist.pkl.gz', 'rb')\n",
	" training_data, validation_data, test_data = cPickle.load(f, encoding='latin1')\n",
	" f.close()\n",
	" return (training_data, validation_data, test_data)\n",
	"\n",
	"def load_data_wrapper():\n",
	" \"\"\"Return a tuple containing ``(training_data, validation_data,\n",
	" test_data)``. Based on ``load_data``, but the format is more\n",
	" convenient for use in our implementation of neural networks.\n",
	" In particular, ``training_data`` is a list containing 50,000\n",
	" 2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray\n",
	" containing the input image. ``y`` is a 10-dimensional\n",
	" numpy.ndarray representing the unit vector corresponding to the\n",
	" correct digit for ``x``.\n",
	" ``validation_data`` and ``test_data`` are lists containing 10,000\n",
	" 2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional\n",
	" numpy.ndarry containing the input image, and ``y`` is the\n",
	" corresponding classification, i.e., the digit values (integers)\n",
	" corresponding to ``x``.\n",
	" Obviously, this means we're using slightly different formats for\n",
	" the training data and the validation / test data. These formats\n",
	" turn out to be the most convenient for use in our neural network\n",
	" code.\"\"\"\n",
	" tr_d, va_d, te_d = load_data()\n",
	" training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]\n",
	" training_results = [vectorized_result(y) for y in tr_d[1]]\n",
	" training_data = zip(training_inputs, training_results)\n",
	" validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]\n",
	" validation_data = zip(validation_inputs, va_d[1])\n",
	" test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]\n",
	" test_data = zip(test_inputs, te_d[1])\n",
	" return (training_data, validation_data, test_data)\n",
	"\n",
	"def vectorized_result(j):\n",
	" \"\"\"Return a 10-dimensional unit vector with a 1.0 in the jth\n",
	" position and zeroes elsewhere. This is used to convert a digit\n",
	" (0...9) into a corresponding desired output from the neural\n",
	" network.\"\"\"\n",
	" e = np.zeros((10, 1))\n",
	" e[j] = 1.0\n",
	" return e"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {},
	"outputs": [],
	"source": [
	"training_data, validation_data, test_data = load_data_wrapper()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {},
	"outputs": [],
	"source": [
	"learner = NN([784, 30, 10])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {},
	"outputs": [
	{
	"ename": "TypeError",
	"evalue": "object of type 'zip' has no len()",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-44-742e24694131>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlearner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3.0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0.1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
	"\u001b[0;32m<ipython-input-40-6f3c6ca52295>\u001b[0m in \u001b[0;36mtrainer\u001b[0;34m(self, training_data, epochs, m_bs, eta, lmbda, eval_data, print_eval_cost, print_eval_acc, print_train_cost, print_train_acc)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0mtrain_cost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_acc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepochs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m \u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 82\u001b[0m mini_batches = [training_data[k:k+m_bs] for k in\n\u001b[1;32m 83\u001b[0m range(0, n, m_bs)]\n",
	"\u001b[0;32m~/anaconda3/envs/fastai/lib/python3.7/random.py\u001b[0m in \u001b[0;36mshuffle\u001b[0;34m(self, x, random)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrandom\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0mrandbelow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_randbelow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mreversed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 276\u001b[0m \u001b[0;31m# pick an element in x[:i+1] with which to exchange x[i]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrandbelow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mTypeError\u001b[0m: object of type 'zip' has no len()"
	]
	}
	],
	"source": [
	"learner.trainer(training_data, 30, 10, 3.0, 0.1, eval_data=test_data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"\|"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}