Technically an implementation of MLP-Mixer without nn.Linear
```python
!pip install einops
```

```
Requirement already satisfied: einops in /usr/local/lib/python3.7/dist-packages (0.3.0)
```
This is [lucidrains' implementation](https://github.com/lucidrains/mlp-mixer-pytorch/blob/main/mlp_mixer_pytorch/mlp_mixer_pytorch.py):
```python
from torch import nn
from functools import partial
from einops.layers.torch import Rearrange, Reduce

class PreNormResidual(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        return self.fn(self.norm(x)) + x

def FeedForward(dim, expansion_factor = 4, dropout = 0., dense = nn.Linear):
    return nn.Sequential(
        dense(dim, dim * expansion_factor),
        nn.GELU(),
        nn.Dropout(dropout),
        dense(dim * expansion_factor, dim),
        nn.Dropout(dropout)
    )

def MLPMixer(*, image_size, patch_size, dim, depth, num_classes, expansion_factor = 4, dropout = 0.):
    assert (image_size % patch_size) == 0, 'image must be divisible by patch size'
    num_patches = (image_size // patch_size) ** 2
    chan_first, chan_last = partial(nn.Conv1d, kernel_size = 1), nn.Linear

    return nn.Sequential(
        Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
        nn.Linear((patch_size ** 2) * 3, dim),
        *[nn.Sequential(
            PreNormResidual(dim, FeedForward(num_patches, expansion_factor, dropout, chan_first)),
            PreNormResidual(dim, FeedForward(dim, expansion_factor, dropout, chan_last))
        ) for _ in range(depth)],
        nn.LayerNorm(dim),
        Reduce('b n c -> b c', 'mean'),
        nn.Linear(dim, num_classes)
    )
```
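Note the `chan_first` trick this implementation already relies on: a kernel-size-1 `nn.Conv1d` applied to a `(batch, patches, dim)` tensor mixes the patch axis, exactly like an `nn.Linear` applied along that axis. Here is a minimal standalone sketch of that equivalence (my addition, not part of the gist, with arbitrary sizes):

```python
import torch
from torch import nn

b, n, c = 2, 8, 16
x = torch.randn(b, n, c)

conv = nn.Conv1d(n, n, kernel_size=1)            # treats dim 1 (the patch axis) as channels
lin = nn.Linear(n, n)
lin.weight.data = conv.weight.data.squeeze(-1)   # (n, n, 1) -> (n, n)
lin.bias.data = conv.bias.data

out_conv = conv(x)
out_lin = lin(x.transpose(1, 2)).transpose(1, 2) # apply the Linear along the patch axis
assert (out_conv - out_lin).abs().max() < 1e-5
```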
```python
import torch

model = MLPMixer(
    image_size = 256,
    patch_size = 16,
    dim = 512,
    depth = 12,
    num_classes = 1000
)

img = torch.randn(1, 3, 256, 256)
pred = model(img) # (1, 1000)
params = model.state_dict()
```
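As a quick side check (my addition, not in the original notebook), the size of this configuration can be read off the model directly:

```python
# Count trainable parameters in the mixer (sketch; prints a raw count rather
# than asserting any particular number).
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{n_params / 1e6:.1f}M parameters")
```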
```python
import time
batch = torch.randn(16, 3, 256, 256).cuda()
model = model.cuda()
before = time.time()
for _ in range(10):
    _ = model(batch)
print(f"Execution time {time.time() - before}")
```

```
Execution time 0.5921013355255127
```
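One caveat with this timing (here and in the later timing cell): CUDA kernels launch asynchronously, so wrapping the loop in `time.time()` calls can misattribute work. A more careful sketch, my addition rather than the gist's, synchronizes around the loop and disables autograd since only inference is being measured:

```python
import time
import torch

@torch.no_grad()
def time_model(model, batch, iters=10):
    torch.cuda.synchronize()   # wait for pending kernels before starting the clock
    start = time.time()
    for _ in range(iters):
        _ = model(batch)
    torch.cuda.synchronize()   # wait for the measured kernels to actually finish
    return time.time() - start
```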
Here it is without any Linear modules:
```python
class PreNormResidual(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        return self.fn(self.norm(x)) + x

def FeedForward(dim, expansion_factor = 4, dropout = 0., dense = None):
    return nn.Sequential(
        dense(dim, dim * expansion_factor),
        nn.GELU(),
        nn.Dropout(dropout),
        dense(dim * expansion_factor, dim),
        nn.Dropout(dropout)
    )

def MLPMixer(*, image_size, patch_size, dim, depth, num_classes, expansion_factor = 4, dropout = 0.):
    assert (image_size % patch_size) == 0, 'image must be divisible by patch size'
    num_patches = (image_size // patch_size) ** 2
    # channel-last "Linear": swap the last two axes so the feature dimension
    # lands on Conv1d's channel axis, apply a 1x1 conv, then swap back
    def dense(dim_in, dim_out):
        return nn.Sequential(Rearrange('b c n -> b n c'), nn.Conv1d(dim_in, dim_out, 1), Rearrange('b n c -> b c n'))
    chan_first = partial(nn.Conv1d, kernel_size = 1)  # mixes the patch axis directly
    chan_last = dense

    return nn.Sequential(
        Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
        dense((patch_size ** 2) * 3, dim),
        *[nn.Sequential(
            PreNormResidual(dim, FeedForward(num_patches, expansion_factor, dropout, chan_first)),
            PreNormResidual(dim, FeedForward(dim, expansion_factor, dropout, chan_last))
        ) for _ in range(depth)],
        nn.LayerNorm(dim),
        Reduce('b n c -> b () c', 'mean'),  # keep a singleton axis so the Conv1d head sees a 3D tensor
        dense(dim, num_classes),
        Rearrange('b () c -> b c')          # squeeze back to (batch, classes)
    )
```
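Since `dense` is local to `MLPMixer`, here is a standalone sanity check of the channel-last wrapper (a sketch of mine, re-declaring the same wrapper, with arbitrary sizes): tying its 1x1 conv weights to an `nn.Linear` should give identical outputs.

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange

def dense(dim_in, dim_out):
    return nn.Sequential(Rearrange('b c n -> b n c'), nn.Conv1d(dim_in, dim_out, 1), Rearrange('b n c -> b c n'))

layer = dense(768, 512)
ref = nn.Linear(768, 512)
ref.weight.data = layer[1].weight.data.squeeze(-1)  # (512, 768, 1) -> (512, 768)
ref.bias.data = layer[1].bias.data

x = torch.randn(2, 256, 768)                        # (batch, patches, features)
assert (layer(x) - ref(x)).abs().max() < 1e-4
```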
This cell verifies that the Linear-free model produces the same output once the weights are copied across:
```python
_model = MLPMixer(
    image_size = 256,
    patch_size = 16,
    dim = 512,
    depth = 12,
    num_classes = 1000
)

_params = {}
keys = [k for k in params]
for k in _model.state_dict():
    p = params[keys.pop(0)]
    # Linear weights are (out, in); 1x1 Conv1d weights are (out, in, 1)
    _params[k] = p.unsqueeze(-1) if 'weight' in k and p.ndim == 2 else p
_model.load_state_dict(_params)

_pred = _model(img) # (1, 1000)
assert torch.abs(pred - _pred).max() < 1e-3
```
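An illustration of the shape relationship the copy above relies on (a sketch, not from the notebook): `nn.Linear` stores its weight as `(out_features, in_features)`, while a kernel-size-1 `nn.Conv1d` stores `(out_channels, in_channels, 1)`, so `unsqueeze(-1)` maps one onto the other.

```python
lin = nn.Linear(512, 1000)
conv = nn.Conv1d(512, 1000, 1)
assert lin.weight.shape == (1000, 512)
assert conv.weight.shape == (1000, 512, 1)
assert lin.weight.unsqueeze(-1).shape == conv.weight.shape
```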
```python
_model = _model.cuda()
before = time.time()
for _ in range(10):
    _ = _model(batch)
print(f"Execution time {time.time() - before}")
```

```
Execution time 0.8272452354431152
```

So it works, but at roughly a 40% wall-clock penalty over the `nn.Linear` version on this batch (0.83s vs 0.59s for ten forward passes), hence only *technically* an implementation without `nn.Linear`.