@ilyarudyak
Created September 9, 2021 18:22
Gist for medium post "Just another post about grad descend"
from builtins import range
import numpy as np
from random import shuffle
# from past.builtins import xrange


def softmax_loss_naive(W, X, y, reg=0.0):
    """
    Softmax loss function, naive implementation (with loops).

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    #############################################################################
    # TODO: Compute the softmax loss and its gradient using explicit loops.     #
    # Store the loss in loss and the gradient in dW. If you are not careful     #
    # here, it is easy to run into numeric instability. Don't forget the        #
    # regularization!                                                           #
    #############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    D, C = W.shape
    N, D = X.shape

    # Scores and softmax probabilities; shift by the row-wise max before
    # exponentiating to avoid numeric overflow.
    F = X @ W  # N x C
    F -= np.max(F, axis=1, keepdims=True)
    S = np.exp(F) / np.sum(np.exp(F), axis=1, keepdims=True)

    # Data loss: average negative log-probability of the correct class,
    # plus L2 regularization.
    for i in range(N):
        s = S[i, :]
        loss -= np.log(s[y[i]])
    loss /= N
    loss += reg * np.sum(W * W)

    # Gradient: dL/dF is S with 1 subtracted at the correct classes;
    # backpropagate through F = X @ W.
    dS = np.copy(S)
    for i in range(N):
        dS[i, y[i]] -= 1
    dW = X.T @ dS
    dW /= N
    dW += 2 * reg * W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return loss, dW

def softmax_loss_vectorized(W, X, y, reg=0.0):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    #############################################################################
    # TODO: Compute the softmax loss and its gradient using no explicit loops.  #
    # Store the loss in loss and the gradient in dW. If you are not careful     #
    # here, it is easy to run into numeric instability. Don't forget the        #
    # regularization!                                                           #
    #############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    D, C = W.shape
    N, D = X.shape

    # Scores, shifted row-wise for numerical stability, and softmax probabilities.
    F = X @ W
    F -= np.max(F, axis=1, keepdims=True)
    S = np.exp(F) / np.sum(np.exp(F), axis=1, keepdims=True)

    # Data loss plus L2 regularization.
    loss -= np.mean(np.log(S[np.arange(N), y]))
    loss += reg * np.sum(W * W)

    # Gradient: subtract 1 at the correct classes, then backpropagate.
    dS = np.copy(S)
    dS[np.arange(N), y] -= 1
    dW = X.T @ dS
    dW /= N
    dW += 2 * reg * W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return loss, dW
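

if __name__ == "__main__":
    # Minimal smoke test (not part of the original gist): on small random data
    # the naive and vectorized implementations should agree closely.
    np.random.seed(0)
    N, D, C = 10, 2, 3
    W = np.random.randn(D, C)
    X = np.random.randn(N, D)
    y = np.random.randint(C, size=N)

    loss_naive, dW_naive = softmax_loss_naive(W, X, y)
    loss_vec, dW_vec = softmax_loss_vectorized(W, X, y)
    print(f"naive loss: {loss_naive:.6f}, vectorized loss: {loss_vec:.6f}")
    print(f"max grad difference: {np.max(np.abs(dW_naive - dW_vec)):.2e}")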
{
"cells": [
{
"cell_type": "code",
"execution_count": 86,
"id": "loose-authorization",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"import numpy as np\n",
"import torch\n",
"from cs231n.classifiers.softmax import *\n",
"from cs231n.gradient_check import grad_check_sparse\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"id": "swedish-cathedral",
"metadata": {},
"source": [
"So what are we going to do here - implement softmax in `numpy` (both forward pass and backward pass) and compare results with:\n",
"- numerical computations;\n",
"- `pytorch` computetions;"
]
},
{
"cell_type": "markdown",
"id": "overall-scottish",
"metadata": {},
"source": [
"## 00 - generate data"
]
},
{
"cell_type": "markdown",
"id": "greenhouse-orientation",
"metadata": {},
"source": [
"Inputs have dimension `D`, there are `C` classes, and we operate on minibatches of `N` examples. \n",
"\n",
"Inputs:\n",
"- `W`: A numpy array of shape `(D, C)` containing weights;\n",
"- `X`: A numpy array of shape `(N, D)` containing a minibatch of data;\n",
"- `y`: A numpy array of shape `(N,)` containing training labels;"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "integral-macintosh",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<torch._C.Generator at 0x11cd1bab0>"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.manual_seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "rational-block",
"metadata": {},
"outputs": [],
"source": [
"N, D, C = 10, 2, 3\n",
"\n",
"W_tens = torch.randn([D, C], requires_grad=True)\n",
"X_tens = torch.randn([N, D], requires_grad=True)\n",
"y_tens = torch.randint(low=0, high=C, size=(N,))\n",
"\n",
"W = W_tens.detach().numpy()\n",
"X = X_tens.detach().numpy()\n",
"y = y_tens.detach().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "colonial-christmas",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 0.3367, 0.1288, 0.2345],\n",
" [ 0.2303, -1.1229, -0.1863]], requires_grad=True)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W_tens"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "based-drove",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.33669037, 0.1288094 , 0.23446237],\n",
" [ 0.23033303, -1.1228564 , -0.18632829]], dtype=float32)"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "annual-cable",
"metadata": {},
"outputs": [],
"source": [
"F_tens = torch.mm(X_tens, W_tens)\n",
"F = F_tens.detach().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "crucial-battery",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[-0.3619, 1.3247, 0.1669],\n",
" [-0.0410, -0.6456, -0.2089],\n",
" [-0.6669, 0.7999, -0.1623]], grad_fn=<SliceBackward>)"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F_tens[:3, :]"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "accompanied-senegal",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.36187768, 1.3246738 , 0.16691756],\n",
" [-0.04102264, -0.6455607 , -0.2089102 ],\n",
" [-0.6668808 , 0.799914 , -0.16231792]], dtype=float32)"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F[:3, :]"
]
},
{
"cell_type": "markdown",
"id": "reverse-borough",
"metadata": {},
"source": [
"## 01 - forward pass"
]
},
{
"cell_type": "markdown",
"id": "angry-correlation",
"metadata": {},
"source": [
"First let's try our naive and vectorized versions."
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "architectural-nerve",
"metadata": {},
"outputs": [],
"source": [
"loss_naive, _ = softmax_loss_naive(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "divine-cornell",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.4709\n"
]
}
],
"source": [
"print(f'{loss_naive:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "perceived-closing",
"metadata": {},
"outputs": [],
"source": [
"loss_vec, _ = softmax_loss_vectorized(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "effective-phoenix",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.4709\n"
]
}
],
"source": [
"print(f'{loss_vec:.4f}')"
]
},
{
"cell_type": "markdown",
"id": "educational-portugal",
"metadata": {},
"source": [
"Now let's try `torch` loss. First we'll try it with a class `CrossEntropyLoss` and then with a function. Looks like everything works just fine. That's a very important exercise to understand how `pytorch` actually works. \n",
"\n",
"We may read in [documentation](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) the formula for it that is exactly the same we use in cs231n:\n",
"\n",
"$$loss(x, class) = −log\\left(\\frac{exp(x_{class})}{\\sum_j{exp(x_j)}}\\right)$$\n",
"The losses are averaged across observations for each minibatch, again, like we do in cs231n."
]
},
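{
"cell_type": "markdown",
"id": "added-manual-ce-note",
"metadata": {},
"source": [
"As a sanity check of the formula (a small sketch added here, not part of the original notebook), we can evaluate it by hand in `numpy` on our scores `F`; the result should match `CrossEntropyLoss` below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-manual-ce",
"metadata": {},
"outputs": [],
"source": [
"# Hand-computed cross-entropy: log-softmax of the scores, pick the correct\n",
"# class for each example, negate and average over the minibatch.\n",
"log_probs = F - np.log(np.exp(F).sum(axis=1, keepdims=True))\n",
"manual_loss = -log_probs[np.arange(N), y].mean()\n",
"print(f'{manual_loss:.4f}')"
]
},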
{
"cell_type": "code",
"execution_count": 69,
"id": "young-february",
"metadata": {},
"outputs": [],
"source": [
"loss_object = torch.nn.CrossEntropyLoss()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "behind-convention",
"metadata": {},
"outputs": [],
"source": [
"loss_torch = loss_object(F_tens, y_tens)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "graphic-brook",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor(1.4709, grad_fn=<NllLossBackward>)"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loss_torch"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "tracked-recommendation",
"metadata": {},
"outputs": [],
"source": [
"# loss_torch_fun = torch.nn.functional.cross_entropy(F_tens, y_tens)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "comprehensive-steam",
"metadata": {},
"outputs": [],
"source": [
"# loss_torch_fun"
]
},
{
"cell_type": "markdown",
"id": "worth-festival",
"metadata": {},
"source": [
"## 02 - backward pass"
]
},
{
"cell_type": "markdown",
"id": "atomic-timeline",
"metadata": {},
"source": [
"The next question - backward pass. It's a bit more involved but let's start from the formula for softmax derivative. We use the following [tutorial from cs231n](https://cs231n.github.io/neural-networks-case-study/#grad)."
]
},
{
"cell_type": "markdown",
"id": "hidden-throat",
"metadata": {},
"source": [
"### 02-1 math for grad"
]
},
{
"cell_type": "markdown",
"id": "celtic-patent",
"metadata": {},
"source": [
"#### 02-1-1 softmax derivative"
]
},
{
"cell_type": "markdown",
"id": "moderate-movie",
"metadata": {},
"source": [
"Softmax is the vector function: \n",
"$$s: \\mathbb{R^n} \\rightarrow \\mathbb{R^n}$$ \n",
"$$s(f) = \\begin{bmatrix} \\frac{e^{f_1}}{\\sum_j{e^{f_j}}} \\\\ ... \\\\\\frac{e^{f_1}}{\\sum_j{e^{f_j}}} \\end{bmatrix}$$\n",
"so its Jacobian matrix has the size $n \\times n$."
]
},
{
"cell_type": "markdown",
"id": "dying-control",
"metadata": {},
"source": [
"Jacobian is computed as follows:\n",
"\n",
"$$\\frac{\\partial s_i}{\\partial f_j} = \\delta_{ij} s_i - s_i s_j$$"
]
},
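{
"cell_type": "markdown",
"id": "added-jacobian-check-note",
"metadata": {},
"source": [
"A quick numerical check of this Jacobian formula (a sketch added here, not part of the original notebook): compare $\\delta_{ij} s_i - s_i s_j$ against central differences for a random score vector."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-jacobian-check",
"metadata": {},
"outputs": [],
"source": [
"def softmax_np(f):\n",
"    e = np.exp(f - np.max(f))\n",
"    return e / e.sum()\n",
"\n",
"f0 = np.random.randn(C)\n",
"s0 = softmax_np(f0)\n",
"\n",
"# Analytic Jacobian: diag(s) - s s^T.\n",
"J_analytic = np.diag(s0) - np.outer(s0, s0)\n",
"\n",
"# Central-difference estimate, one column per perturbed component of f.\n",
"h = 1e-5\n",
"J_numeric = np.zeros((C, C))\n",
"for j in range(C):\n",
"    df = np.zeros(C)\n",
"    df[j] = h\n",
"    J_numeric[:, j] = (softmax_np(f0 + df) - softmax_np(f0 - df)) / (2 * h)\n",
"\n",
"np.max(np.abs(J_analytic - J_numeric))"
]
},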
{
"cell_type": "markdown",
"id": "confused-devon",
"metadata": {},
"source": [
"Now let's compute derivative with respect to $W$. The only non-zero term: \n",
"\n",
"$$\\frac{\\partial f_k}{\\partial W_{jk}} = x_j$$"
]
},
{
"cell_type": "markdown",
"id": "printable-miniature",
"metadata": {},
"source": [
"$$\\frac{\\partial s_i}{\\partial W_{jk}} = \\frac{\\partial s_i}{\\partial f_1} \\frac{\\partial f_1}{\\partial W_{jk}} + ... = \\frac{\\partial s_i}{\\partial f_k} x_j = (\\delta_{ik} s_i - s_i s_k) x_j$$"
]
},
{
"cell_type": "markdown",
"id": "grateful-recording",
"metadata": {},
"source": [
"To compute this using Jacobians - see [this post](https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/)."
]
},
{
"cell_type": "markdown",
"id": "correct-rings",
"metadata": {},
"source": [
"#### 02-1-2 cross-entropy for 1 example "
]
},
{
"cell_type": "markdown",
"id": "mechanical-corrections",
"metadata": {},
"source": [
"Now what's the cross-entropy loss for 1 training example? \n",
"\n",
"$$L_i = -log \\left( \\frac{exp(f_{y_i})}{\\exp(\\sum_j{f_j})} \\right) = -log(s_{y_i})$$\n",
"\n",
"Basically we have to take negative log of the right component of softmax: $s_{y_i}$. "
]
},
{
"cell_type": "markdown",
"id": "designed-creator",
"metadata": {},
"source": [
"Let's compute its derivative. First of all its a vector from $\\mathbb{R}^n$. We have:\n",
"\n",
"$$\\frac{\\partial L_i}{\\partial f_j} = \n",
"-\\frac{1}{s_{y_i}} \\frac{\\partial s_{y_i}}{\\partial f_j}=\n",
"-\\frac{1}{s_{y_i}} (\\delta_{y_ij} s_{y_i} - s_{y_i}s_j) = \n",
"s_j - \\delta_{y_ij} $$\n",
"\n",
"$$\\nabla_f L_i = s - \\mathbb{1}(y_i = j)$$"
]
},
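{
"cell_type": "markdown",
"id": "added-grad-f-check-note",
"metadata": {},
"source": [
"Let's numerically verify $\\nabla_f L_i = s - \\mathbb{1}(y_i = j)$ for a single example (a sketch added here, not part of the original notebook), using the scores `F` and labels `y` generated above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-grad-f-check",
"metadata": {},
"outputs": [],
"source": [
"i = 0\n",
"f_i = F[i].astype(np.float64)\n",
"\n",
"def loss_one_example(f):\n",
"    e = np.exp(f - np.max(f))\n",
"    s = e / e.sum()\n",
"    return -np.log(s[y[i]])\n",
"\n",
"# Analytic gradient: softmax minus one-hot of the correct class.\n",
"s_i = np.exp(f_i - f_i.max()) / np.exp(f_i - f_i.max()).sum()\n",
"grad_analytic = s_i.copy()\n",
"grad_analytic[y[i]] -= 1\n",
"\n",
"# Central-difference gradient.\n",
"h = 1e-6\n",
"grad_numeric = np.zeros(C)\n",
"for j in range(C):\n",
"    df = np.zeros(C)\n",
"    df[j] = h\n",
"    grad_numeric[j] = (loss_one_example(f_i + df) - loss_one_example(f_i - df)) / (2 * h)\n",
"\n",
"grad_analytic, np.max(np.abs(grad_analytic - grad_numeric))"
]
},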
{
"cell_type": "markdown",
"id": "intelligent-uruguay",
"metadata": {},
"source": [
"Now let's compute derivative with respect to $W$. We are still talking about only one training examples."
]
},
{
"cell_type": "markdown",
"id": "offensive-rebound",
"metadata": {},
"source": [
"$$\n",
"\\frac{\\partial L_i}{\\partial W_{jk}} = \n",
"\\frac{\\partial L_i}{\\partial f_1} \\frac{\\partial f_1}{\\partial W_{jk}} + ... = \n",
"\\frac{\\partial L_i}{\\partial f_k} \\frac{\\partial f_k}{\\partial W_{jk}} =\n",
"\\frac{\\partial L_i}{\\partial f_k} x_j = \n",
"(s_k - \\delta_{y_ik}) x_j\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "hydraulic-turning",
"metadata": {},
"source": [
"#### 02-1-3 cross-entropy for $N$ examples"
]
},
{
"cell_type": "markdown",
"id": "portable-minority",
"metadata": {},
"source": [
"Suppose now we have 2 examples (for simplicity). Let's sketch a proof that:\n",
"\n",
"$$\\nabla_W L = 1/2\\ X^T \\tilde{S}$$\n",
"\n",
"where $\\tilde{S}$ is $S$ with subtracted $1$s. Suppose even that we don't need to subtract those $1$s."
]
},
{
"cell_type": "markdown",
"id": "piano-adrian",
"metadata": {},
"source": [
"$$\n",
"X = \\begin{bmatrix} x^{(1)T} \\\\ x^{(2)T} \\end{bmatrix} \\ (2, D); \\ \n",
"X^T = \\begin{bmatrix} x^{(1)} \\ x^{(2)} \\end{bmatrix} \\ (D, 2)\n",
"$$\n",
"\n",
"$$\n",
"S = \n",
"\\begin{bmatrix} s(f^{(1)}) \\\\ s(f^{(2)}) \\end{bmatrix} = \n",
"\\begin{bmatrix} s^{(1)} \\\\ s^{(2)} \\end{bmatrix} \\ (2, C)\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "digital-editor",
"metadata": {},
"source": [
"So first of all dimensions of $X^T S$ is $(D, C)$ which is dimension of $W$. So dimensions are correct. Let's now look at the product."
]
},
{
"cell_type": "markdown",
"id": "separate-township",
"metadata": {},
"source": [
"$$\n",
"X^T S = x^{(1)} s^{(1)} + x^{(2)} s^{(2)}\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "constant-cache",
"metadata": {},
"source": [
"Here's we have a product of a column vector and a row vector with $ij$ elements that's just of the form $x_i s_j$:\n",
"\n",
"$$\n",
"x^{(1)} s^{(1)} = \n",
"\\begin{bmatrix} | \\\\ x^{(1)} \\\\ | \\end{bmatrix}\n",
"\\begin{bmatrix} - \\ s^{(1)} \\ - \\end{bmatrix}\n",
"$$\n",
"\n",
"$$\n",
"(x^{(1)} s^{(1)})_{ij} =\n",
"x^{(1)}_i s^{(1)}_j\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "suspected-eight",
"metadata": {},
"source": [
"Finally let's have a look at the derivative of our loss $L = (L_1 + L_2) / 2$:\n",
"\n",
"$$\n",
"\\frac{\\partial L_1}{\\partial W_{ij}} = x^{(1)}_i s^{(1)}_j\n",
"\\frac{\\partial L_2}{\\partial W_{ij}} = x^{(2)}_i s^{(2)}_j\n",
"\\frac{\\partial L}{\\partial W_{ij}} = (x^{(1)}_i s^{(1)}_j + x^{(2)}_i s^{(2)}_j) / 2\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "superior-bargain",
"metadata": {},
"source": [
"So we may see that:\n",
"\n",
"$$\n",
"(1/2) (X^T S)_{ij} = \\frac{\\partial L}{\\partial W_{ij}}\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "earned-revolution",
"metadata": {},
"source": [
"This concludes the sketch of our proof."
]
},
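{
"cell_type": "markdown",
"id": "added-matrix-identity-check-note",
"metadata": {},
"source": [
"We can also check this identity numerically for our `N` examples (a sketch added here, not part of the original notebook): `X.T @ dS / N` should equal the averaged sum of per-example outer products $x^{(i)} (s^{(i)} - \\mathbb{1}_{y_i})^T$."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-matrix-identity-check",
"metadata": {},
"outputs": [],
"source": [
"# Softmax probabilities and the 'subtract 1 at the correct class' matrix.\n",
"S_check = np.exp(F) / np.exp(F).sum(axis=1, keepdims=True)\n",
"dS_check = S_check.copy()\n",
"dS_check[np.arange(N), y] -= 1\n",
"\n",
"# Matrix form vs explicit sum of per-example outer products.\n",
"lhs = X.T @ dS_check / N\n",
"rhs = sum(np.outer(X[i], dS_check[i]) for i in range(N)) / N\n",
"np.max(np.abs(lhs - rhs))"
]
},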
{
"cell_type": "markdown",
"id": "three-edmonton",
"metadata": {},
"source": [
"### 02-2 code for grad"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "governmental-mistress",
"metadata": {},
"outputs": [],
"source": [
"_, dW = softmax_loss_naive(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "extra-rainbow",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.08522899, 0.11266414, -0.02743511],\n",
" [ 0.35102898, -0.41072887, 0.05969987]], dtype=float32)"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dW"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "threatened-heavy",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((2, 3), (2, 3))"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W.shape, dW.shape "
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "round-milwaukee",
"metadata": {},
"outputs": [],
"source": [
"_, dW = softmax_loss_vectorized(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "young-agenda",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.08522901, 0.11266414, -0.02743511],\n",
" [ 0.351029 , -0.41072887, 0.05969986]], dtype=float32)"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dW"
]
},
{
"cell_type": "markdown",
"id": "spare-latest",
"metadata": {},
"source": [
"Let's compare this with automatic gradient. Looks like our function works."
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "authentic-instrument",
"metadata": {},
"outputs": [],
"source": [
"loss_torch.backward()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "loaded-yugoslavia",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[-0.0852, 0.1127, -0.0274],\n",
" [ 0.3510, -0.4107, 0.0597]])"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W_tens.grad"
]
},
{
"cell_type": "markdown",
"id": "asian-break",
"metadata": {},
"source": [
"### 02-3 numerical grad"
]
},
{
"cell_type": "markdown",
"id": "biblical-vulnerability",
"metadata": {},
"source": [
"Finally let's try to use numerical gradient. Results again are quite close."
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "automotive-beads",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: 0.113249 analytic: 0.112664, relative error: 2.588095e-03\n",
"numerical: 0.113249 analytic: 0.112664, relative error: 2.588095e-03\n",
"numerical: -0.029802 analytic: -0.027435, relative error: 4.135772e-02\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: -0.029802 analytic: -0.027435, relative error: 4.135772e-02\n",
"numerical: -0.411272 analytic: -0.410729, relative error: 6.607987e-04\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n"
]
}
],
"source": [
"f = lambda w: softmax_loss_vectorized(w, X, y)[0]\n",
"grad_numerical = grad_check_sparse(f, W, dW, 10)"
]
},
{
"cell_type": "markdown",
"id": "searching-grade",
"metadata": {},
"source": [
"Let's have a look at how this numerical gradient is actually computed."
]
},
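{
"cell_type": "markdown",
"id": "added-grad-check-sketch-note",
"metadata": {},
"source": [
"Here is a sketch of how such a sparse check can be implemented (added here for illustration; the real `grad_check_sparse` in `cs231n.gradient_check` may differ in details): pick a few random entries of `W`, estimate the partial derivative with a centered difference, and compare it with the analytic gradient."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-grad-check-sketch",
"metadata": {},
"outputs": [],
"source": [
"def grad_check_sparse_sketch(f, x, analytic_grad, num_checks=10, h=1e-5):\n",
"    # Centered difference at a handful of random coordinates of x.\n",
"    for _ in range(num_checks):\n",
"        ix = tuple(np.random.randint(d) for d in x.shape)\n",
"        old = x[ix]\n",
"        x[ix] = old + h\n",
"        fxph = f(x)\n",
"        x[ix] = old - h\n",
"        fxmh = f(x)\n",
"        x[ix] = old\n",
"        grad_numerical = (fxph - fxmh) / (2 * h)\n",
"        grad_analytic = analytic_grad[ix]\n",
"        rel_error = abs(grad_numerical - grad_analytic) / (abs(grad_numerical) + abs(grad_analytic))\n",
"        print(f'numerical: {grad_numerical:f} analytic: {grad_analytic:f}, relative error: {rel_error:e}')\n",
"\n",
"grad_check_sparse_sketch(f, W, dW, 10)"
]
},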
{
"cell_type": "code",
"execution_count": null,
"id": "becoming-joseph",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "lovely-marks",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "advised-unknown",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "reported-appearance",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "tight-differential",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "departmental-victory",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}