{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Different ways to perform gradient accumulation.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNfXRuve6/wKWr8q1FO9WQw",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/akaanirban/76427ef338d39b45f9858036b99f4ca3/different-ways-to-perform-gradient-accumulation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BubD_LHm8FWT"
},
"source": [
"## Different ways to perform gradient accumulation "
]
},
{
"cell_type": "code",
"metadata": {
"id": "iMo8XExI2Qob"
},
"source": [
"import torch\n",
"import math"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "waFqq-nE2Sfg"
},
"source": [
"dtype = torch.float\n",
"device = torch.device(\"cpu\")\n",
"# device = torch.device(\"cuda:0\") # Uncomment this to run on GPU"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "23pczZ6X8Ith"
},
"source": [
"### 1. Normal full batch gradient"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fbCl2u1y2kkJ",
"outputId": "96fd895e-41c4-4c8d-90e2-2a1a6a3453bf"
},
"source": [
"# Create Tensors to hold input and outputs.\n",
"# By default, requires_grad=False, which indicates that we do not need to\n",
"# compute gradients with respect to these Tensors during the backward pass.\n",
"X = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)\n",
"Y = torch.sin(X)\n",
"\n",
"# Create random Tensors for weights. For a third order polynomial, we need\n",
"# 4 weights: y = a + b x + c x^2 + d x^3\n",
"# Setting requires_grad=True indicates that we want to compute gradients with\n",
"# respect to these Tensors during the backward pass.\n",
"m = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"n = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"o = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"p = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"\n",
"a, b, c, d = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"a.requires_grad=True\n",
"b.requires_grad=True\n",
"c.requires_grad=True\n",
"d.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"for t in range(1):\n",
" # Forward pass: compute predicted y using operations on Tensors.\n",
" y_pred = a + b * X + c * X ** 2 + d * X ** 3\n",
"\n",
" # Compute and print loss using operations on Tensors.\n",
" # Now loss is a Tensor of shape (1,)\n",
" # loss.item() gets the scalar value held in the loss.\n",
" loss = (y_pred - Y).pow(2).sum()\n",
" if t % 100 == 99:\n",
" print(t, loss.item())\n",
"\n",
" # Use autograd to compute the backward pass. This call will compute the\n",
" # gradient of loss with respect to all Tensors with requires_grad=True.\n",
" # After this call a.grad, b.grad. c.grad and d.grad will be Tensors holding\n",
" # the gradient of the loss with respect to a, b, c, d respectively.\n",
" loss.backward()\n",
"\n",
" # Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
" # because weights have requires_grad=True, but we don't need to track this\n",
" # in autograd.\n",
" with torch.no_grad():\n",
" a -= learning_rate * a.grad\n",
" b -= learning_rate * b.grad\n",
" c -= learning_rate * c.grad\n",
" d -= learning_rate * d.grad\n",
"\n",
" # # Manually zero the gradients after updating weights\n",
"\n",
"\n",
"print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324247390031815 + -0.17653822898864746 x + 0.2124224752187729 x^2 + 0.699370801448822 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SGuxppb82n1T",
"outputId": "3a2631a6-b738-4e18-f3f2-f8746ff4ad29"
},
"source": [
"a.grad, b.grad, c.grad, d.grad"
],
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7627),\n",
" tensor(113246.4297),\n",
" tensor(16697.1406),\n",
" tensor(812837.1875))"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nOfCFx9_8OJB"
},
"source": [
"### 2. By appending the loss to a total loss and then calculating the gradient on the total loss at the end."
]
},
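{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why this gives the same gradient as the full batch: the full-batch loss is just the sum of the per-sample losses, and the gradient is linear in that sum, so\n",
"\n",
"$$\\nabla_\\theta \\sum_i L_i(\\theta) = \\sum_i \\nabla_\\theta L_i(\\theta).$$\n",
"\n",
"The same identity is what makes approaches 3 and 4 below work: calling `backward()` on each $L_i$ (or on each mini-batch loss) without zeroing `.grad` accumulates exactly the right-hand side."
]
},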
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6b8YcoJW3UKb",
"outputId": "08fd1858-7d8f-47be-abe2-082bc63450f4"
},
"source": [
"e, f, g, h = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"e.requires_grad=True\n",
"f.requires_grad=True\n",
"g.requires_grad=True\n",
"h.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"total_loss = 0\n",
"for i in range(2000):\n",
" # Forward pass: compute predicted y using operations on Tensors.\n",
" y_pred = e + f * X[i] + g * X[i] ** 2 + h * X[i] ** 3\n",
"\n",
" # Compute and print loss using operations on Tensors.\n",
" # Now loss is a Tensor of shape (1,)\n",
" # loss.item() gets the scalar value held in the loss.\n",
" total_loss += (y_pred - Y[i]).pow(2).sum()\n",
" \n",
"total_loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
" e -= learning_rate * e.grad\n",
" f -= learning_rate * f.grad\n",
" g -= learning_rate * g.grad\n",
" h -= learning_rate * h.grad\n",
"\n",
" # # Manually zero the gradients after updating weights\n",
"\n",
"\n",
"print(f'Result: y = {e.item()} + {f.item()} x + {g.item()} x^2 + {h.item()} x^3')"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248135089874 + -0.17653831839561462 x + 0.21242240071296692 x^2 + 0.6993705630302429 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4prg-P9C6X0g",
"outputId": "902e7c80-76b3-4f9b-8d48-9dd636a32e9a"
},
"source": [
"e.grad, f.grad, g.grad, h.grad"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7666),\n",
" tensor(113246.5156),\n",
" tensor(16697.2148),\n",
" tensor(812837.4375))"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XF_txP5C8eAH"
},
"source": [
"### By calculating gradients on multiple batches without zeroing out the gradients and then take a step"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JxyoPjcu6K1e",
"outputId": "5b3bb7f1-7245-4677-d78f-df9273d45556"
},
"source": [
"w, x, y, z = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"w.requires_grad=True\n",
"x.requires_grad=True\n",
"y.requires_grad=True\n",
"z.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"\n",
"for i in range(2000):\n",
" # Forward pass: compute predicted y using operations on Tensors.\n",
" y_pred = w + x * X[i] + y * X[i] ** 2 + z * X[i] ** 3\n",
"\n",
" # Compute and print loss using operations on Tensors.\n",
" # Now loss is a Tensor of shape (1,)\n",
" # loss.item() gets the scalar value held in the loss.\n",
" total_loss = (y_pred - Y[i]).pow(2).sum()\n",
" total_loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
" w -= learning_rate * w.grad\n",
" x -= learning_rate * x.grad\n",
" y -= learning_rate * y.grad\n",
" z -= learning_rate * z.grad\n",
"\n",
" # # Manually zero the gradients after updating weights\n",
"\n",
"\n",
"print(f'Result: y = {w.item()} + {x.item()} x + {y.item()} x^2 + {z.item()} x^3')"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248880147934 + -0.176538348197937 x + 0.2124224454164505 x^2 + 0.6993707418441772 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oTKTRahG69cl",
"outputId": "1b93af5d-b6ee-494f-f932-a730940e9d43"
},
"source": [
"w.grad, x.grad, y.grad, z.grad"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7747),\n",
" tensor(113246.5547),\n",
" tensor(16697.1738),\n",
" tensor(812837.2500))"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zCAIhRs_8sM_"
},
"source": [
"### 4. Point 3. works if we divide the dataset into batches. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "z2V-xUzM7mTb",
"outputId": "8bca2fc5-3e7f-4e00-a0fc-190f65fdfd61"
},
"source": [
"import numpy as np \n",
"w, x, y, z = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"w.requires_grad=True\n",
"x.requires_grad=True\n",
"y.requires_grad=True\n",
"z.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"batch_size =200\n",
"num_batches = int(np.ceil(X.shape[0]/batch_size))\n",
"\n",
"for i in range(num_batches):\n",
" # Forward pass: compute predicted y using operations on Tensors.\n",
" y_pred = w + x * X[i*batch_size: (i+1)*batch_size] + y * X[i*batch_size: (i+1)*batch_size] ** 2 + z * X[i*batch_size: (i+1)*batch_size] ** 3\n",
"\n",
" # Compute and print loss using operations on Tensors.\n",
" # Now loss is a Tensor of shape (1,)\n",
" # loss.item() gets the scalar value held in the loss.\n",
" total_loss = (y_pred - Y[i*batch_size: (i+1)*batch_size]).pow(2).sum()\n",
" total_loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
" w -= learning_rate * w.grad\n",
" x -= learning_rate * x.grad\n",
" y -= learning_rate * y.grad\n",
" z -= learning_rate * z.grad\n",
"\n",
" # # Manually zero the gradients after updating weights\n",
"\n",
"\n",
"print(f'Result: y = {w.item()} + {x.item()} x + {y.item()} x^2 + {z.item()} x^3')"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248135089874 + -0.17653822898864746 x + 0.2124224752187729 x^2 + 0.6993708610534668 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4THMzuTA7-EJ",
"outputId": "f45ffa00-2cbf-4ccc-858f-2507cb406347"
},
"source": [
"w.grad, x.grad, y.grad, z.grad"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7646),\n",
" tensor(113246.4219),\n",
" tensor(16697.1328),\n",
" tensor(812837.1250))"
]
},
"metadata": {},
"execution_count": 11
}
]
},
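{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a sketch added here, assuming the cells above were run in order): the accumulated gradients from approaches 2-4 should match the full-batch gradients from approach 1 up to floating-point rounding, since only the summation order differs. Note that `w, x, y, z` were reused, so their `.grad` now holds the result of approach 4."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Compare the full-batch gradients (a, b, c, d) against the accumulated ones:\n",
"# e, f, g, h -> loss accumulated, one backward(); w, x, y, z -> one backward() per batch.\n",
"for full, acc in [(a, e), (b, f), (c, g), (d, h), (a, w), (b, x), (c, y), (d, z)]:\n",
"    assert torch.allclose(full.grad, acc.grad, rtol=1e-4), (full.grad, acc.grad)\n",
"print(\"Accumulated gradients match the full-batch gradients (up to rounding).\")"
],
"execution_count": null,
"outputs": []
},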
{
"cell_type": "markdown",
"metadata": {
"id": "_sqYSRsY80Xt"
},
"source": [
"### Ref: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html and https://discuss.pytorch.org/t/why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch/4903/20"
]
},
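{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not part of the original comparison: a minimal sketch of how the same accumulation pattern is usually written with `torch.optim`. `zero_grad()` is called only once per accumulation cycle, after `optimizer.step()`, which is exactly the \"don't zero between backward() calls\" idea above. (In practice the per-batch loss is often divided by the number of accumulation steps to keep the scale of an average loss.)"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch: gradient accumulation with an optimizer, on the same sine-fitting problem.\n",
"theta = torch.randn(4, requires_grad=True)  # a, b, c, d packed into one tensor\n",
"optimizer = torch.optim.SGD([theta], lr=1e-6)\n",
"\n",
"batch_size = 200\n",
"accumulation_steps = X.shape[0] // batch_size\n",
"\n",
"optimizer.zero_grad()\n",
"for i in range(accumulation_steps):\n",
"    xb = X[i * batch_size:(i + 1) * batch_size]\n",
"    yb = Y[i * batch_size:(i + 1) * batch_size]\n",
"    y_pred = theta[0] + theta[1] * xb + theta[2] * xb ** 2 + theta[3] * xb ** 3\n",
"    loss = (y_pred - yb).pow(2).sum()\n",
"    loss.backward()  # accumulates into theta.grad, as in approaches 3 and 4\n",
"\n",
"optimizer.step()       # one update from the accumulated gradients\n",
"optimizer.zero_grad()  # reset .grad before the next accumulation cycle\n",
"print(theta)"
],
"execution_count": null,
"outputs": []
},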
{
"cell_type": "code",
"metadata": {
"id": "qWc5UHml85Vj"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}