torch.linalg.inv
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "cb572805",
"metadata": {},
"outputs": [],
"source": [
"from contextlib import contextmanager\n",
"\n",
"import torch\n",
"import cupy as cp"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ba845d71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting environment information...\n",
"PyTorch version: 1.13.1+cu117\n",
"Is debug build: False\n",
"CUDA used to build PyTorch: 11.7\n",
"ROCM used to build PyTorch: N/A\n",
"\n",
"OS: Ubuntu 20.04.5 LTS (x86_64)\n",
"GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
"Clang version: Could not collect\n",
"CMake version: version 3.16.3\n",
"Libc version: glibc-2.31\n",
"\n",
"Python version: 3.9.13 (main, Aug 25 2022, 23:26:10) [GCC 11.2.0] (64-bit runtime)\n",
"Python platform: Linux-3.10.0-862.el7.x86_64-x86_64-with-glibc2.31\n",
"Is CUDA available: True\n",
"CUDA runtime version: 11.7.99\n",
"CUDA_MODULE_LOADING set to: LAZY\n",
"GPU models and configuration: \n",
"GPU 0: Tesla V100-SXM2-16GB\n",
"GPU 1: Tesla V100-SXM2-16GB\n",
"GPU 2: Tesla V100-SXM2-16GB\n",
"GPU 3: Tesla V100-SXM2-16GB\n",
"\n",
"Nvidia driver version: 510.47.03\n",
"cuDNN version: Probably one of the following:\n",
"/usr/lib/x86_64-linux-gnu/libcudnn.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.5.0\n",
"HIP runtime version: N/A\n",
"MIOpen runtime version: N/A\n",
"Is XNNPACK available: True\n",
"\n",
"Versions of relevant libraries:\n",
"[pip3] mypy==0.991\n",
"[pip3] mypy-extensions==0.4.3\n",
"[pip3] numpy==1.21.5\n",
"[pip3] numpydoc==1.4.0\n",
"[pip3] pytorch-ignite==0.4.10\n",
"[pip3] torch==1.13.1+cu117\n",
"[pip3] torchaudio==0.13.1+cu117\n",
"[pip3] torchvision==0.14.1+cu117\n",
"[conda] blas 1.0 mkl \n",
"[conda] mkl 2021.4.0 h06a4308_640 \n",
"[conda] mkl-service 2.4.0 py39h7f8727e_0 \n",
"[conda] mkl_fft 1.3.1 py39hd3c417c_0 \n",
"[conda] mkl_random 1.2.2 py39h51133e4_0 \n",
"[conda] numpy 1.21.5 py39h6c91a56_3 \n",
"[conda] numpy-base 1.21.5 py39ha15fc14_3 \n",
"[conda] numpydoc 1.4.0 py39h06a4308_0 \n",
"[conda] pytorch-ignite 0.4.10 pypi_0 pypi\n",
"[conda] torch 1.13.1+cu117 pypi_0 pypi\n",
"[conda] torchaudio 0.13.1+cu117 pypi_0 pypi\n",
"[conda] torchvision 0.14.1+cu117 pypi_0 pypi\n"
]
}
],
"source": [
"! python collect_env.py"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "abe8a9ac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OS : Linux-3.10.0-862.el7.x86_64-x86_64-with-glibc2.31\n",
"Python Version : 3.9.13\n",
"CuPy Version : 11.4.0\n",
"CuPy Platform : NVIDIA CUDA\n",
"NumPy Version : 1.21.5\n",
"SciPy Version : 1.8.1\n",
"Cython Build Version : 0.29.32\n",
"Cython Runtime Version : 0.29.32\n",
"CUDA Root : /usr/local/cuda\n",
"nvcc PATH : /usr/local/cuda/bin/nvcc\n",
"CUDA Build Version : 11080\n",
"CUDA Driver Version : 11060\n",
"CUDA Runtime Version : 11070\n",
"cuBLAS Version : (available)\n",
"cuFFT Version : 10702\n",
"cuRAND Version : 10210\n",
"cuSOLVER Version : (11, 4, 0)\n",
"cuSPARSE Version : (available)\n",
"NVRTC Version : (11, 7)\n",
"Thrust Version : 101501\n",
"CUB Build Version : 101501\n",
"Jitify Build Version : 4a37de0\n",
"cuDNN Build Version : 8600\n",
"cuDNN Version : 8500\n",
"NCCL Build Version : 21505\n",
"NCCL Runtime Version : 21304\n",
"cuTENSOR Version : None\n",
"cuSPARSELt Build Version : None\n",
"Device 0 Name : Tesla V100-SXM2-16GB\n",
"Device 0 Compute Capability : 70\n",
"Device 0 PCI Bus ID : 0000:3D:00.0\n",
"Device 1 Name : Tesla V100-SXM2-16GB\n",
"Device 1 Compute Capability : 70\n",
"Device 1 PCI Bus ID : 0000:3E:00.0\n",
"Device 2 Name : Tesla V100-SXM2-16GB\n",
"Device 2 Compute Capability : 70\n",
"Device 2 PCI Bus ID : 0000:B1:00.0\n",
"Device 3 Name : Tesla V100-SXM2-16GB\n",
"Device 3 Compute Capability : 70\n",
"Device 3 PCI Bus ID : 0000:B2:00.0\n"
]
}
],
"source": [
"cp.show_config()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3cdda1a7",
"metadata": {},
"outputs": [],
"source": [
"def _torch_alloc(size):\n",
" device = cp.cuda.Device().id\n",
" tensor = torch.empty(size, dtype=torch.uint8, device=device)\n",
" return cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(tensor.data_ptr(), size, tensor), 0)\n",
"\n",
"cp.cuda.set_allocator(_torch_alloc)"
]
},
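{
"cell_type": "markdown",
"id": "a1b2c3d4",
"metadata": {},
"source": [
"The cell above routes every CuPy memory request through `torch.empty`, so both libraries draw from PyTorch's caching allocator instead of keeping two separate GPU memory pools; the returned `MemoryPointer` wraps the tensor as unowned memory and keeps it alive as the owner. The next cell is a minimal sketch added for illustration (not part of the original benchmark run), assuming `torch.cuda.memory_allocated` reflects the shared pool: a CuPy allocation should now show up in PyTorch's accounting."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5e6f7a8",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch (added, not from the original run): allocate ~4 MB through CuPy\n",
"# and check that PyTorch's allocator accounting grows, since the request is\n",
"# serviced by _torch_alloc -> torch.empty.\n",
"before = torch.cuda.memory_allocated()\n",
"buf = cp.zeros(10**6, dtype=cp.float32)\n",
"after = torch.cuda.memory_allocated()\n",
"print(after - before)  # expected to be at least 4 * 10**6 bytes"
]
},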
{
"cell_type": "code",
"execution_count": 5,
"id": "011cb567",
"metadata": {},
"outputs": [],
"source": [
"def cupy_inv(x_):\n",
" x = cp.from_dlpack(x_)\n",
" return torch.from_dlpack(cp.linalg.inv(x))"
]
},
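{
"cell_type": "markdown",
"id": "c9d0e1f2",
"metadata": {},
"source": [
"`cupy_inv` exchanges data through DLPack, so no device-to-device copy is made: `cp.from_dlpack` views the PyTorch tensor's memory as a CuPy array, and `torch.from_dlpack` views CuPy's result back as a tensor. The next cell is a small sanity check added for illustration (not part of the original run): it compares `cupy_inv` against `torch.linalg.inv` on a tiny batch and prints the largest elementwise difference, which should be on the order of float32 round-off for well-conditioned inputs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3e4f5a6",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (added, not part of the original benchmark): both code paths\n",
"# should agree up to float32 round-off error.\n",
"x = torch.randn(8, 4, 4, dtype=torch.complex64, device=\"cuda\")\n",
"diff = (torch.linalg.inv(x) - cupy_inv(x)).abs().max()\n",
"print(diff)  # expected to be tiny for well-conditioned random matrices"
]
},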
{
"cell_type": "code",
"execution_count": 6,
"id": "5d3cf714",
"metadata": {},
"outputs": [],
"source": [
"dtype = torch.complex64"
]
},
{
"cell_type": "markdown",
"id": "14c9d1e8",
"metadata": {},
"source": [
"# inv for 100000 x 4 x 4"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "00d60af4",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 4, 4)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1d82ccb7",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9144c2cb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"639 µs ± 270 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e3162cfe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"288 µs ± 1.83 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "markdown",
"id": "f191dccf",
"metadata": {},
"source": [
"# inv for 100000 x 8 x 8"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "68655f98",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 8, 8)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d415cd3a",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "93362ea7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.47 ms ± 358 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7cc0aa71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"697 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "markdown",
"id": "a67f4a27",
"metadata": {},
"source": [
"# inv for 100000 x 16 x 16"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "082c2aed",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 16, 16)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "3dd8c8ca",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "3272c1af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"11.9 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "4714cd63",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4.39 ms ± 846 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "markdown",
"id": "e54fb8ed",
"metadata": {},
"source": [
"# inv for 100000 x 32 x 32"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "a75e181f",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 32, 32)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "555e2610",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "99fd1017",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36.6 ms ± 67.5 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "a57a4192",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24.5 ms ± 6.64 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
}
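,
{
"cell_type": "markdown",
"id": "e7f8a9b0",
"metadata": {},
"source": [
"# Summary\n",
"\n",
"On this setup (Tesla V100, complex64, batches of 100000 matrices), the CuPy path was consistently faster than `torch.linalg.inv`: 288 µs vs. 639 µs for 4 x 4, 697 µs vs. 1.47 ms for 8 x 8, 4.39 ms vs. 11.9 ms for 16 x 16, and 24.5 ms vs. 36.6 ms for 32 x 32, i.e. roughly a 1.5x to 2.7x speed-up depending on the matrix size."
]
}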
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}