@ilyarudyak
Created September 9, 2021 18:22
Gist for medium post "Just another post about grad descend"
from builtins import range
import numpy as np
from random import shuffle
# from past.builtins import xrange


def softmax_loss_naive(W, X, y, reg=0.0):
    """
    Softmax loss function, naive implementation (with loops).

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    #############################################################################
    # TODO: Compute the softmax loss and its gradient using explicit loops.     #
    # Store the loss in loss and the gradient in dW. If you are not careful     #
    # here, it is easy to run into numeric instability. Don't forget the        #
    # regularization!                                                           #
    #############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    D, C = W.shape
    N, D = X.shape

    # Scores and softmax probabilities; shift by the row-wise max before
    # exponentiating to avoid numeric overflow.
    F = X @ W  # N x C
    F -= np.max(F, axis=1, keepdims=True)
    S = np.exp(F) / np.sum(np.exp(F), axis=1, keepdims=True)

    # Data loss: average negative log-probability of the correct class,
    # plus L2 regularization.
    for i in range(N):
        s = S[i, :]
        loss -= np.log(s[y[i]])
    loss /= N
    loss += reg * np.sum(W * W)

    # Gradient: dL/dF is S with 1 subtracted at the correct classes;
    # backpropagate through F = X @ W.
    dS = np.copy(S)
    for i in range(N):
        dS[i, y[i]] -= 1
    dW = X.T @ dS
    dW /= N
    dW += 2 * reg * W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return loss, dW

def softmax_loss_vectorized(W, X, y, reg=0.0):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    #############################################################################
    # TODO: Compute the softmax loss and its gradient using no explicit loops.  #
    # Store the loss in loss and the gradient in dW. If you are not careful     #
    # here, it is easy to run into numeric instability. Don't forget the        #
    # regularization!                                                           #
    #############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    D, C = W.shape
    N, D = X.shape

    # Scores, shifted row-wise for numerical stability, and softmax probabilities.
    F = X @ W
    F -= np.max(F, axis=1, keepdims=True)
    S = np.exp(F) / np.sum(np.exp(F), axis=1, keepdims=True)

    # Data loss plus L2 regularization.
    loss -= np.mean(np.log(S[np.arange(N), y]))
    loss += reg * np.sum(W * W)

    # Gradient: subtract 1 at the correct classes, then backpropagate.
    dS = np.copy(S)
    dS[np.arange(N), y] -= 1
    dW = X.T @ dS
    dW /= N
    dW += 2 * reg * W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return loss, dW
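

if __name__ == "__main__":
    # Minimal smoke test (not part of the original gist): on small random data
    # the naive and vectorized implementations should agree closely.
    np.random.seed(0)
    N, D, C = 10, 2, 3
    W = np.random.randn(D, C)
    X = np.random.randn(N, D)
    y = np.random.randint(C, size=N)

    loss_naive, dW_naive = softmax_loss_naive(W, X, y)
    loss_vec, dW_vec = softmax_loss_vectorized(W, X, y)
    print(f"naive loss: {loss_naive:.6f}, vectorized loss: {loss_vec:.6f}")
    print(f"max grad difference: {np.max(np.abs(dW_naive - dW_vec)):.2e}")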
{
"cells": [
{
"cell_type": "code",
"execution_count": 86,
"id": "loose-authorization",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"import numpy as np\n",
"import torch\n",
"from cs231n.classifiers.softmax import *\n",
"from cs231n.gradient_check import grad_check_sparse\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"id": "swedish-cathedral",
"metadata": {},
"source": [
"So what are we going to do here - implement softmax in `numpy` (both forward pass and backward pass) and compare results with:\n",
"- numerical computations;\n",
"- `pytorch` computetions;"
]
},
{
"cell_type": "markdown",
"id": "overall-scottish",
"metadata": {},
"source": [
"## 00 - generate data"
]
},
{
"cell_type": "markdown",
"id": "greenhouse-orientation",
"metadata": {},
"source": [
"Inputs have dimension `D`, there are `C` classes, and we operate on minibatches of `N` examples. \n",
"\n",
"Inputs:\n",
"- `W`: A numpy array of shape `(D, C)` containing weights;\n",
"- `X`: A numpy array of shape `(N, D)` containing a minibatch of data;\n",
"- `y`: A numpy array of shape `(N,)` containing training labels;"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "integral-macintosh",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<torch._C.Generator at 0x11cd1bab0>"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.manual_seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "rational-block",
"metadata": {},
"outputs": [],
"source": [
"N, D, C = 10, 2, 3\n",
"\n",
"W_tens = torch.randn([D, C], requires_grad=True)\n",
"X_tens = torch.randn([N, D], requires_grad=True)\n",
"y_tens = torch.randint(low=0, high=C, size=(N,))\n",
"\n",
"W = W_tens.detach().numpy()\n",
"X = X_tens.detach().numpy()\n",
"y = y_tens.detach().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "colonial-christmas",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 0.3367, 0.1288, 0.2345],\n",
" [ 0.2303, -1.1229, -0.1863]], requires_grad=True)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W_tens"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "based-drove",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.33669037, 0.1288094 , 0.23446237],\n",
" [ 0.23033303, -1.1228564 , -0.18632829]], dtype=float32)"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "annual-cable",
"metadata": {},
"outputs": [],
"source": [
"F_tens = torch.mm(X_tens, W_tens)\n",
"F = F_tens.detach().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "crucial-battery",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[-0.3619, 1.3247, 0.1669],\n",
" [-0.0410, -0.6456, -0.2089],\n",
" [-0.6669, 0.7999, -0.1623]], grad_fn=<SliceBackward>)"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F_tens[:3, :]"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "accompanied-senegal",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.36187768, 1.3246738 , 0.16691756],\n",
" [-0.04102264, -0.6455607 , -0.2089102 ],\n",
" [-0.6668808 , 0.799914 , -0.16231792]], dtype=float32)"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F[:3, :]"
]
},
{
"cell_type": "markdown",
"id": "reverse-borough",
"metadata": {},
"source": [
"## 01 - forward pass"
]
},
{
"cell_type": "markdown",
"id": "angry-correlation",
"metadata": {},
"source": [
"First let's try our naive and vectorized versions."
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "architectural-nerve",
"metadata": {},
"outputs": [],
"source": [
"loss_naive, _ = softmax_loss_naive(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "divine-cornell",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.4709\n"
]
}
],
"source": [
"print(f'{loss_naive:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "perceived-closing",
"metadata": {},
"outputs": [],
"source": [
"loss_vec, _ = softmax_loss_vectorized(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "effective-phoenix",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.4709\n"
]
}
],
"source": [
"print(f'{loss_vec:.4f}')"
]
},
{
"cell_type": "markdown",
"id": "educational-portugal",
"metadata": {},
"source": [
"Now let's try `torch` loss. First we'll try it with a class `CrossEntropyLoss` and then with a function. Looks like everything works just fine. That's a very important exercise to understand how `pytorch` actually works. \n",
"\n",
"We may read in [documentation](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) the formula for it that is exactly the same we use in cs231n:\n",
"\n",
"$$loss(x, class) = −log\\left(\\frac{exp(x_{class})}{\\sum_j{exp(x_j)}}\\right)$$\n",
"The losses are averaged across observations for each minibatch, again, like we do in cs231n."
]
},
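{
"cell_type": "markdown",
"id": "added-manual-ce-note",
"metadata": {},
"source": [
"As a sanity check of the formula (a small sketch added here, not part of the original notebook), we can evaluate it by hand in `numpy` on our scores `F`; the result should match `CrossEntropyLoss` below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-manual-ce",
"metadata": {},
"outputs": [],
"source": [
"# Hand-computed cross-entropy: log-softmax of the scores, pick the correct\n",
"# class for each example, negate and average over the minibatch.\n",
"log_probs = F - np.log(np.exp(F).sum(axis=1, keepdims=True))\n",
"manual_loss = -log_probs[np.arange(N), y].mean()\n",
"print(f'{manual_loss:.4f}')"
]
},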
{
"cell_type": "code",
"execution_count": 69,
"id": "young-february",
"metadata": {},
"outputs": [],
"source": [
"loss_object = torch.nn.CrossEntropyLoss()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "behind-convention",
"metadata": {},
"outputs": [],
"source": [
"loss_torch = loss_object(F_tens, y_tens)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "graphic-brook",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor(1.4709, grad_fn=<NllLossBackward>)"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loss_torch"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "tracked-recommendation",
"metadata": {},
"outputs": [],
"source": [
"# loss_torch_fun = torch.nn.functional.cross_entropy(F_tens, y_tens)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "comprehensive-steam",
"metadata": {},
"outputs": [],
"source": [
"# loss_torch_fun"
]
},
{
"cell_type": "markdown",
"id": "worth-festival",
"metadata": {},
"source": [
"## 02 - backward pass"
]
},
{
"cell_type": "markdown",
"id": "atomic-timeline",
"metadata": {},
"source": [
"The next question - backward pass. It's a bit more involved but let's start from the formula for softmax derivative. We use the following [tutorial from cs231n](https://cs231n.github.io/neural-networks-case-study/#grad)."
]
},
{
"cell_type": "markdown",
"id": "hidden-throat",
"metadata": {},
"source": [
"### 02-1 math for grad"
]
},
{
"cell_type": "markdown",
"id": "celtic-patent",
"metadata": {},
"source": [
"#### 02-1-1 softmax derivative"
]
},
{
"cell_type": "markdown",
"id": "moderate-movie",
"metadata": {},
"source": [
"Softmax is the vector function: \n",
"$$s: \\mathbb{R^n} \\rightarrow \\mathbb{R^n}$$ \n",
"$$s(f) = \\begin{bmatrix} \\frac{e^{f_1}}{\\sum_j{e^{f_j}}} \\\\ ... \\\\\\frac{e^{f_1}}{\\sum_j{e^{f_j}}} \\end{bmatrix}$$\n",
"so its Jacobian matrix has the size $n \\times n$."
]
},
{
"cell_type": "markdown",
"id": "dying-control",
"metadata": {},
"source": [
"Jacobian is computed as follows:\n",
"\n",
"$$\\frac{\\partial s_i}{\\partial f_j} = \\delta_{ij} s_i - s_i s_j$$"
]
},
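{
"cell_type": "markdown",
"id": "added-jacobian-check-note",
"metadata": {},
"source": [
"A quick numerical check of this Jacobian formula (a sketch added here, not part of the original notebook): compare $\\delta_{ij} s_i - s_i s_j$ against central differences for a random score vector."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-jacobian-check",
"metadata": {},
"outputs": [],
"source": [
"def softmax_np(f):\n",
"    e = np.exp(f - np.max(f))\n",
"    return e / e.sum()\n",
"\n",
"f0 = np.random.randn(C)\n",
"s0 = softmax_np(f0)\n",
"\n",
"# Analytic Jacobian: diag(s) - s s^T.\n",
"J_analytic = np.diag(s0) - np.outer(s0, s0)\n",
"\n",
"# Central-difference estimate, one column per perturbed component of f.\n",
"h = 1e-5\n",
"J_numeric = np.zeros((C, C))\n",
"for j in range(C):\n",
"    df = np.zeros(C)\n",
"    df[j] = h\n",
"    J_numeric[:, j] = (softmax_np(f0 + df) - softmax_np(f0 - df)) / (2 * h)\n",
"\n",
"np.max(np.abs(J_analytic - J_numeric))"
]
},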
{
"cell_type": "markdown",
"id": "confused-devon",
"metadata": {},
"source": [
"Now let's compute derivative with respect to $W$. The only non-zero term: \n",
"\n",
"$$\\frac{\\partial f_k}{\\partial W_{jk}} = x_j$$"
]
},
{
"cell_type": "markdown",
"id": "printable-miniature",
"metadata": {},
"source": [
"$$\\frac{\\partial s_i}{\\partial W_{jk}} = \\frac{\\partial s_i}{\\partial f_1} \\frac{\\partial f_1}{\\partial W_{jk}} + ... = \\frac{\\partial s_i}{\\partial f_k} x_j = (\\delta_{ik} s_i - s_i s_k) x_j$$"
]
},
{
"cell_type": "markdown",
"id": "grateful-recording",
"metadata": {},
"source": [
"To compute this using Jacobians - see [this post](https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/)."
]
},
{
"cell_type": "markdown",
"id": "correct-rings",
"metadata": {},
"source": [
"#### 02-1-2 cross-entropy for 1 example "
]
},
{
"cell_type": "markdown",
"id": "mechanical-corrections",
"metadata": {},
"source": [
"Now what's the cross-entropy loss for 1 training example? \n",
"\n",
"$$L_i = -log \\left( \\frac{exp(f_{y_i})}{\\exp(\\sum_j{f_j})} \\right) = -log(s_{y_i})$$\n",
"\n",
"Basically we have to take negative log of the right component of softmax: $s_{y_i}$. "
]
},
{
"cell_type": "markdown",
"id": "designed-creator",
"metadata": {},
"source": [
"Let's compute its derivative. First of all its a vector from $\\mathbb{R}^n$. We have:\n",
"\n",
"$$\\frac{\\partial L_i}{\\partial f_j} = \n",
"-\\frac{1}{s_{y_i}} \\frac{\\partial s_{y_i}}{\\partial f_j}=\n",
"-\\frac{1}{s_{y_i}} (\\delta_{y_ij} s_{y_i} - s_{y_i}s_j) = \n",
"s_j - \\delta_{y_ij} $$\n",
"\n",
"$$\\nabla_f L_i = s - \\mathbb{1}(y_i = j)$$"
]
},
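{
"cell_type": "markdown",
"id": "added-grad-f-check-note",
"metadata": {},
"source": [
"Let's numerically verify $\\nabla_f L_i = s - \\mathbb{1}(y_i = j)$ for a single example (a sketch added here, not part of the original notebook), using the scores `F` and labels `y` generated above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-grad-f-check",
"metadata": {},
"outputs": [],
"source": [
"i = 0\n",
"f_i = F[i].astype(np.float64)\n",
"\n",
"def loss_one_example(f):\n",
"    e = np.exp(f - np.max(f))\n",
"    s = e / e.sum()\n",
"    return -np.log(s[y[i]])\n",
"\n",
"# Analytic gradient: softmax minus one-hot of the correct class.\n",
"s_i = np.exp(f_i - f_i.max()) / np.exp(f_i - f_i.max()).sum()\n",
"grad_analytic = s_i.copy()\n",
"grad_analytic[y[i]] -= 1\n",
"\n",
"# Central-difference gradient.\n",
"h = 1e-6\n",
"grad_numeric = np.zeros(C)\n",
"for j in range(C):\n",
"    df = np.zeros(C)\n",
"    df[j] = h\n",
"    grad_numeric[j] = (loss_one_example(f_i + df) - loss_one_example(f_i - df)) / (2 * h)\n",
"\n",
"grad_analytic, np.max(np.abs(grad_analytic - grad_numeric))"
]
},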
{
"cell_type": "markdown",
"id": "intelligent-uruguay",
"metadata": {},
"source": [
"Now let's compute derivative with respect to $W$. We are still talking about only one training examples."
]
},
{
"cell_type": "markdown",
"id": "offensive-rebound",
"metadata": {},
"source": [
"$$\n",
"\\frac{\\partial L_i}{\\partial W_{jk}} = \n",
"\\frac{\\partial L_i}{\\partial f_1} \\frac{\\partial f_1}{\\partial W_{jk}} + ... = \n",
"\\frac{\\partial L_i}{\\partial f_k} \\frac{\\partial f_k}{\\partial W_{jk}} =\n",
"\\frac{\\partial L_i}{\\partial f_k} x_j = \n",
"(s_k - \\delta_{y_ik}) x_j\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "hydraulic-turning",
"metadata": {},
"source": [
"#### 02-1-3 cross-entropy for $N$ examples"
]
},
{
"cell_type": "markdown",
"id": "portable-minority",
"metadata": {},
"source": [
"Suppose now we have 2 examples (for simplicity). Let's sketch a proof that:\n",
"\n",
"$$\\nabla_W L = 1/2\\ X^T \\tilde{S}$$\n",
"\n",
"where $\\tilde{S}$ is $S$ with subtracted $1$s. Suppose even that we don't need to subtract those $1$s."
]
},
{
"cell_type": "markdown",
"id": "piano-adrian",
"metadata": {},
"source": [
"$$\n",
"X = \\begin{bmatrix} x^{(1)T} \\\\ x^{(2)T} \\end{bmatrix} \\ (2, D); \\ \n",
"X^T = \\begin{bmatrix} x^{(1)} \\ x^{(2)} \\end{bmatrix} \\ (D, 2)\n",
"$$\n",
"\n",
"$$\n",
"S = \n",
"\\begin{bmatrix} s(f^{(1)}) \\\\ s(f^{(2)}) \\end{bmatrix} = \n",
"\\begin{bmatrix} s^{(1)} \\\\ s^{(2)} \\end{bmatrix} \\ (2, C)\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "digital-editor",
"metadata": {},
"source": [
"So first of all dimensions of $X^T S$ is $(D, C)$ which is dimension of $W$. So dimensions are correct. Let's now look at the product."
]
},
{
"cell_type": "markdown",
"id": "separate-township",
"metadata": {},
"source": [
"$$\n",
"X^T S = x^{(1)} s^{(1)} + x^{(2)} s^{(2)}\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "constant-cache",
"metadata": {},
"source": [
"Here's we have a product of a column vector and a row vector with $ij$ elements that's just of the form $x_i s_j$:\n",
"\n",
"$$\n",
"x^{(1)} s^{(1)} = \n",
"\\begin{bmatrix} | \\\\ x^{(1)} \\\\ | \\end{bmatrix}\n",
"\\begin{bmatrix} - \\ s^{(1)} \\ - \\end{bmatrix}\n",
"$$\n",
"\n",
"$$\n",
"(x^{(1)} s^{(1)})_{ij} =\n",
"x^{(1)}_i s^{(1)}_j\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "suspected-eight",
"metadata": {},
"source": [
"Finally let's have a look at the derivative of our loss $L = (L_1 + L_2) / 2$:\n",
"\n",
"$$\n",
"\\frac{\\partial L_1}{\\partial W_{ij}} = x^{(1)}_i s^{(1)}_j\n",
"\\frac{\\partial L_2}{\\partial W_{ij}} = x^{(2)}_i s^{(2)}_j\n",
"\\frac{\\partial L}{\\partial W_{ij}} = (x^{(1)}_i s^{(1)}_j + x^{(2)}_i s^{(2)}_j) / 2\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "superior-bargain",
"metadata": {},
"source": [
"So we may see that:\n",
"\n",
"$$\n",
"(1/2) (X^T S)_{ij} = \\frac{\\partial L}{\\partial W_{ij}}\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "earned-revolution",
"metadata": {},
"source": [
"This concludes the sketch of our proof."
]
},
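{
"cell_type": "markdown",
"id": "added-matrix-identity-check-note",
"metadata": {},
"source": [
"We can also check this identity numerically for our `N` examples (a sketch added here, not part of the original notebook): `X.T @ dS / N` should equal the averaged sum of per-example outer products $x^{(i)} (s^{(i)} - \\mathbb{1}_{y_i})^T$."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-matrix-identity-check",
"metadata": {},
"outputs": [],
"source": [
"# Softmax probabilities and the 'subtract 1 at the correct class' matrix.\n",
"S_check = np.exp(F) / np.exp(F).sum(axis=1, keepdims=True)\n",
"dS_check = S_check.copy()\n",
"dS_check[np.arange(N), y] -= 1\n",
"\n",
"# Matrix form vs explicit sum of per-example outer products.\n",
"lhs = X.T @ dS_check / N\n",
"rhs = sum(np.outer(X[i], dS_check[i]) for i in range(N)) / N\n",
"np.max(np.abs(lhs - rhs))"
]
},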
{
"cell_type": "markdown",
"id": "three-edmonton",
"metadata": {},
"source": [
"### 02-2 code for grad"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "governmental-mistress",
"metadata": {},
"outputs": [],
"source": [
"_, dW = softmax_loss_naive(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "extra-rainbow",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.08522899, 0.11266414, -0.02743511],\n",
" [ 0.35102898, -0.41072887, 0.05969987]], dtype=float32)"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dW"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "threatened-heavy",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((2, 3), (2, 3))"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W.shape, dW.shape "
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "round-milwaukee",
"metadata": {},
"outputs": [],
"source": [
"_, dW = softmax_loss_vectorized(W, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "young-agenda",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.08522901, 0.11266414, -0.02743511],\n",
" [ 0.351029 , -0.41072887, 0.05969986]], dtype=float32)"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dW"
]
},
{
"cell_type": "markdown",
"id": "spare-latest",
"metadata": {},
"source": [
"Let's compare this with automatic gradient. Looks like our function works."
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "authentic-instrument",
"metadata": {},
"outputs": [],
"source": [
"loss_torch.backward()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "loaded-yugoslavia",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[-0.0852, 0.1127, -0.0274],\n",
" [ 0.3510, -0.4107, 0.0597]])"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W_tens.grad"
]
},
{
"cell_type": "markdown",
"id": "asian-break",
"metadata": {},
"source": [
"### 02-3 numerical grad"
]
},
{
"cell_type": "markdown",
"id": "biblical-vulnerability",
"metadata": {},
"source": [
"Finally let's try to use numerical gradient. Results again are quite close."
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "automotive-beads",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: 0.113249 analytic: 0.112664, relative error: 2.588095e-03\n",
"numerical: 0.113249 analytic: 0.112664, relative error: 2.588095e-03\n",
"numerical: -0.029802 analytic: -0.027435, relative error: 4.135772e-02\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n",
"numerical: -0.029802 analytic: -0.027435, relative error: 4.135772e-02\n",
"numerical: -0.411272 analytic: -0.410729, relative error: 6.607987e-04\n",
"numerical: 0.053644 analytic: 0.059700, relative error: 5.342745e-02\n"
]
}
],
"source": [
"f = lambda w: softmax_loss_vectorized(w, X, y)[0]\n",
"grad_numerical = grad_check_sparse(f, W, dW, 10)"
]
},
{
"cell_type": "markdown",
"id": "searching-grade",
"metadata": {},
"source": [
"Let's have a look at how this numerical gradient is actually computed."
]
},
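{
"cell_type": "markdown",
"id": "added-grad-check-sketch-note",
"metadata": {},
"source": [
"Here is a sketch of how such a sparse check can be implemented (added here for illustration; the real `grad_check_sparse` in `cs231n.gradient_check` may differ in details): pick a few random entries of `W`, estimate the partial derivative with a centered difference, and compare it with the analytic gradient."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-grad-check-sketch",
"metadata": {},
"outputs": [],
"source": [
"def grad_check_sparse_sketch(f, x, analytic_grad, num_checks=10, h=1e-5):\n",
"    # Centered difference at a handful of random coordinates of x.\n",
"    for _ in range(num_checks):\n",
"        ix = tuple(np.random.randint(d) for d in x.shape)\n",
"        old = x[ix]\n",
"        x[ix] = old + h\n",
"        fxph = f(x)\n",
"        x[ix] = old - h\n",
"        fxmh = f(x)\n",
"        x[ix] = old\n",
"        grad_numerical = (fxph - fxmh) / (2 * h)\n",
"        grad_analytic = analytic_grad[ix]\n",
"        rel_error = abs(grad_numerical - grad_analytic) / (abs(grad_numerical) + abs(grad_analytic))\n",
"        print(f'numerical: {grad_numerical:f} analytic: {grad_analytic:f}, relative error: {rel_error:e}')\n",
"\n",
"grad_check_sparse_sketch(f, W, dW, 10)"
]
},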
{
"cell_type": "code",
"execution_count": null,
"id": "becoming-joseph",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "lovely-marks",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "advised-unknown",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "reported-appearance",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "tight-differential",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "departmental-victory",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}