torch.linalg.inv
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "cb572805",
"metadata": {},
"outputs": [],
"source": [
"from contextlib import contextmanager\n",
"\n",
"import torch\n",
"import cupy as cp"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ba845d71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting environment information...\n",
"PyTorch version: 1.13.1+cu117\n",
"Is debug build: False\n",
"CUDA used to build PyTorch: 11.7\n",
"ROCM used to build PyTorch: N/A\n",
"\n",
"OS: Ubuntu 20.04.5 LTS (x86_64)\n",
"GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
"Clang version: Could not collect\n",
"CMake version: version 3.16.3\n",
"Libc version: glibc-2.31\n",
"\n",
"Python version: 3.9.13 (main, Aug 25 2022, 23:26:10) [GCC 11.2.0] (64-bit runtime)\n",
"Python platform: Linux-3.10.0-862.el7.x86_64-x86_64-with-glibc2.31\n",
"Is CUDA available: True\n",
"CUDA runtime version: 11.7.99\n",
"CUDA_MODULE_LOADING set to: LAZY\n",
"GPU models and configuration: \n",
"GPU 0: Tesla V100-SXM2-16GB\n",
"GPU 1: Tesla V100-SXM2-16GB\n",
"GPU 2: Tesla V100-SXM2-16GB\n",
"GPU 3: Tesla V100-SXM2-16GB\n",
"\n",
"Nvidia driver version: 510.47.03\n",
"cuDNN version: Probably one of the following:\n",
"/usr/lib/x86_64-linux-gnu/libcudnn.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.5.0\n",
"/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.5.0\n",
"HIP runtime version: N/A\n",
"MIOpen runtime version: N/A\n",
"Is XNNPACK available: True\n",
"\n",
"Versions of relevant libraries:\n",
"[pip3] mypy==0.991\n",
"[pip3] mypy-extensions==0.4.3\n",
"[pip3] numpy==1.21.5\n",
"[pip3] numpydoc==1.4.0\n",
"[pip3] pytorch-ignite==0.4.10\n",
"[pip3] torch==1.13.1+cu117\n",
"[pip3] torchaudio==0.13.1+cu117\n",
"[pip3] torchvision==0.14.1+cu117\n",
"[conda] blas 1.0 mkl \n",
"[conda] mkl 2021.4.0 h06a4308_640 \n",
"[conda] mkl-service 2.4.0 py39h7f8727e_0 \n",
"[conda] mkl_fft 1.3.1 py39hd3c417c_0 \n",
"[conda] mkl_random 1.2.2 py39h51133e4_0 \n",
"[conda] numpy 1.21.5 py39h6c91a56_3 \n",
"[conda] numpy-base 1.21.5 py39ha15fc14_3 \n",
"[conda] numpydoc 1.4.0 py39h06a4308_0 \n",
"[conda] pytorch-ignite 0.4.10 pypi_0 pypi\n",
"[conda] torch 1.13.1+cu117 pypi_0 pypi\n",
"[conda] torchaudio 0.13.1+cu117 pypi_0 pypi\n",
"[conda] torchvision 0.14.1+cu117 pypi_0 pypi\n"
]
}
],
"source": [
"! python collect_env.py"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "abe8a9ac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OS : Linux-3.10.0-862.el7.x86_64-x86_64-with-glibc2.31\n",
"Python Version : 3.9.13\n",
"CuPy Version : 11.4.0\n",
"CuPy Platform : NVIDIA CUDA\n",
"NumPy Version : 1.21.5\n",
"SciPy Version : 1.8.1\n",
"Cython Build Version : 0.29.32\n",
"Cython Runtime Version : 0.29.32\n",
"CUDA Root : /usr/local/cuda\n",
"nvcc PATH : /usr/local/cuda/bin/nvcc\n",
"CUDA Build Version : 11080\n",
"CUDA Driver Version : 11060\n",
"CUDA Runtime Version : 11070\n",
"cuBLAS Version : (available)\n",
"cuFFT Version : 10702\n",
"cuRAND Version : 10210\n",
"cuSOLVER Version : (11, 4, 0)\n",
"cuSPARSE Version : (available)\n",
"NVRTC Version : (11, 7)\n",
"Thrust Version : 101501\n",
"CUB Build Version : 101501\n",
"Jitify Build Version : 4a37de0\n",
"cuDNN Build Version : 8600\n",
"cuDNN Version : 8500\n",
"NCCL Build Version : 21505\n",
"NCCL Runtime Version : 21304\n",
"cuTENSOR Version : None\n",
"cuSPARSELt Build Version : None\n",
"Device 0 Name : Tesla V100-SXM2-16GB\n",
"Device 0 Compute Capability : 70\n",
"Device 0 PCI Bus ID : 0000:3D:00.0\n",
"Device 1 Name : Tesla V100-SXM2-16GB\n",
"Device 1 Compute Capability : 70\n",
"Device 1 PCI Bus ID : 0000:3E:00.0\n",
"Device 2 Name : Tesla V100-SXM2-16GB\n",
"Device 2 Compute Capability : 70\n",
"Device 2 PCI Bus ID : 0000:B1:00.0\n",
"Device 3 Name : Tesla V100-SXM2-16GB\n",
"Device 3 Compute Capability : 70\n",
"Device 3 PCI Bus ID : 0000:B2:00.0\n"
]
}
],
"source": [
"cp.show_config()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3cdda1a7",
"metadata": {},
"outputs": [],
"source": [
"def _torch_alloc(size):\n",
" device = cp.cuda.Device().id\n",
" tensor = torch.empty(size, dtype=torch.uint8, device=device)\n",
" return cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(tensor.data_ptr(), size, tensor), 0)\n",
"\n",
"cp.cuda.set_allocator(_torch_alloc)"
]
},
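{
"cell_type": "markdown",
"id": "a1b2c3d4",
"metadata": {},
"source": [
"The cell above routes every CuPy memory request through `torch.empty`, so both libraries draw from PyTorch's caching allocator instead of keeping two separate GPU memory pools; the returned `MemoryPointer` wraps the tensor as unowned memory and keeps it alive as the owner. The next cell is a minimal sketch added for illustration (not part of the original benchmark run), assuming `torch.cuda.memory_allocated` reflects the shared pool: a CuPy allocation should now show up in PyTorch's accounting."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5e6f7a8",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch (added, not from the original run): allocate ~4 MB through CuPy\n",
"# and check that PyTorch's allocator accounting grows, since the request is\n",
"# serviced by _torch_alloc -> torch.empty.\n",
"before = torch.cuda.memory_allocated()\n",
"buf = cp.zeros(10**6, dtype=cp.float32)\n",
"after = torch.cuda.memory_allocated()\n",
"print(after - before)  # expected to be at least 4 * 10**6 bytes"
]
},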
{
"cell_type": "code",
"execution_count": 5,
"id": "011cb567",
"metadata": {},
"outputs": [],
"source": [
"def cupy_inv(x_):\n",
" x = cp.from_dlpack(x_)\n",
" return torch.from_dlpack(cp.linalg.inv(x))"
]
},
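{
"cell_type": "markdown",
"id": "c9d0e1f2",
"metadata": {},
"source": [
"`cupy_inv` exchanges data through DLPack, so no device-to-device copy is made: `cp.from_dlpack` views the PyTorch tensor's memory as a CuPy array, and `torch.from_dlpack` views CuPy's result back as a tensor. The next cell is a small sanity check added for illustration (not part of the original run): it compares `cupy_inv` against `torch.linalg.inv` on a tiny batch and prints the largest elementwise difference, which should be on the order of float32 round-off for well-conditioned inputs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3e4f5a6",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (added, not part of the original benchmark): both code paths\n",
"# should agree up to float32 round-off error.\n",
"x = torch.randn(8, 4, 4, dtype=torch.complex64, device=\"cuda\")\n",
"diff = (torch.linalg.inv(x) - cupy_inv(x)).abs().max()\n",
"print(diff)  # expected to be tiny for well-conditioned random matrices"
]
},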
{
"cell_type": "code",
"execution_count": 6,
"id": "5d3cf714",
"metadata": {},
"outputs": [],
"source": [
"dtype = torch.complex64"
]
},
{
"cell_type": "markdown",
"id": "14c9d1e8",
"metadata": {},
"source": [
"# inv for 100000 x 4 x 4"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "00d60af4",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 4, 4)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1d82ccb7",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9144c2cb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"639 µs ± 270 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e3162cfe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"288 µs ± 1.83 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "markdown",
"id": "f191dccf",
"metadata": {},
"source": [
"# inv for 100000 x 8 x 8"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "68655f98",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 8, 8)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d415cd3a",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "93362ea7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.47 ms ± 358 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7cc0aa71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"697 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "markdown",
"id": "a67f4a27",
"metadata": {},
"source": [
"# inv for 100000 x 16 x 16"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "082c2aed",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 16, 16)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "3dd8c8ca",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "3272c1af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"11.9 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "4714cd63",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4.39 ms ± 846 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "markdown",
"id": "e54fb8ed",
"metadata": {},
"source": [
"# inv for 100000 x 32 x 32"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "a75e181f",
"metadata": {},
"outputs": [],
"source": [
"size = (100000, 32, 32)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "555e2610",
"metadata": {},
"outputs": [],
"source": [
"_ = cupy_inv(torch.randn(*size, dtype=dtype, device=\"cuda\")) # initialize CuPy's kernel\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "99fd1017",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36.6 ms ± 67.5 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = torch.linalg.inv(x)\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "a57a4192",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24.5 ms ± 6.64 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit x = torch.randn(*size, dtype=dtype, device=\"cuda\"); torch.cuda.synchronize()\n",
"y = cupy_inv(x)\n",
"torch.cuda.synchronize()"
]
}
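,
{
"cell_type": "markdown",
"id": "e7f8a9b0",
"metadata": {},
"source": [
"# Summary\n",
"\n",
"On this setup (Tesla V100, complex64, batches of 100000 matrices), the CuPy path was consistently faster than `torch.linalg.inv`: 288 µs vs. 639 µs for 4 x 4, 697 µs vs. 1.47 ms for 8 x 8, 4.39 ms vs. 11.9 ms for 16 x 16, and 24.5 ms vs. 36.6 ms for 32 x 32, i.e. roughly a 1.5x to 2.7x speed-up depending on the matrix size."
]
}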
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}