|
{ |
|
"cells": [ |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 1, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"torch version: 1.0.0a0+ff608a9\n", |
|
"CUDA version: 10.0.130\n", |
|
"DISTRIB_DESCRIPTION=\"Ubuntu 16.04 LTS\"\r\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"import sys\n", |
|
"import time\n", |
|
"\n", |
|
"import torch\n", |
|
"from tqdm import tqdm\n", |
|
"\n", |
|
"# Record the software environment these measurements were taken on.\n", |
|
"print('torch version:', torch.__version__)\n", |
|
"print('CUDA version:', (torch.version.cuda))\n", |
|
"# Shell escape: show the last line of /etc/lsb-release.\n", |
|
"!tail -n 1 /etc/lsb-release" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 2, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"class TorchBenchmark:\n", |
|
"    \"\"\"Base class for the GPU throughput benchmarks in this notebook.\n", |
|
"\n", |
|
"    Subclasses must set ``op`` (the module being timed), ``dummy`` (the\n", |
|
"    tensor passed to it), ``FLOP`` (operations counted per call) and ``half``\n", |
|
"    (whether the benchmark runs in fp16) before ``test`` or ``describe`` is used.\n", |
|
"    \"\"\"\n", |
|
"\n", |
|
"    def f(self):\n", |
|
"        \"\"\"Run ``op`` on ``dummy`` forever, yielding ``FLOP`` after each finished call.\"\"\"\n", |
|
"        while True:\n", |
|
"            # Keep no_grad scoped to the call itself: holding the context open\n", |
|
"            # across ``yield`` would leave grad mode disabled in the consumer\n", |
|
"            # for as long as the generator stays suspended.\n", |
|
"            with torch.no_grad():\n", |
|
"                self.op(self.dummy)\n", |
|
"            # Wait for the queued GPU work before reporting it as done.\n", |
|
"            torch.cuda.synchronize()\n", |
|
"            yield self.FLOP\n", |
|
"\n", |
|
"    def test(self, tflop=100):\n", |
|
"        \"\"\"Run the op until ``tflop`` TFLOP are done and return the mean TFLOP/s.\n", |
|
"\n", |
|
"        One TFLOP is counted as 1024 ** 4 operations, matching the progress\n", |
|
"        bar's ``unit_divisor``. fp16 benchmarks run four times the requested\n", |
|
"        amount of work.\n", |
|
"\n", |
|
"        Args:\n", |
|
"            tflop: amount of work to perform, in TFLOP.\n", |
|
"\n", |
|
"        Returns:\n", |
|
"            Completed operations divided by wall-clock seconds, in TFLOP/s.\n", |
|
"        \"\"\"\n", |
|
"        if self.half:\n", |
|
"            tflop *= 4\n", |
|
"        sys.stdout.flush()\n", |
|
"        total = tflop * 1024 ** 4\n", |
|
"\n", |
|
"        done = 0\n", |
|
"        # The bar is driven by hand and the generator is iterated directly.\n", |
|
"        # Wrapping the generator in tqdm as well would make tqdm count one unit\n", |
|
"        # per item on top of the explicit update() calls.\n", |
|
"        with tqdm(total=total, unit='FLOP', unit_scale=True, unit_divisor=1024) as pbar:\n", |
|
"            start = time.perf_counter()\n", |
|
"            for flop in self.f():\n", |
|
"                done += flop\n", |
|
"                # Clamp the final step so the bar never overshoots ``total``.\n", |
|
"                pbar.update(min(flop, total - pbar.n))\n", |
|
"                if done >= total:\n", |
|
"                    break\n", |
|
"            elapsed = time.perf_counter() - start\n", |
|
"\n", |
|
"        # Use the work actually completed and the measured wall time instead of\n", |
|
"        # tqdm's last_print_* fields, which only track display refreshes.\n", |
|
"        mean_speed = done / elapsed / (1024 ** 4)\n", |
|
"        return mean_speed\n", |
|
"\n", |
|
"    def describe(self):\n", |
|
"        \"\"\"Print the input shape, the op and the work per call in GFLOP (1024 ** 3).\"\"\"\n", |
|
"        print('Input shape:', self.dummy.shape)\n", |
|
"        print('Op:', self.op)\n", |
|
"        print(f'{self.FLOP / (1024 ** 3):.3f}GFLOP')\n", |
|
"\n", |
|
"class TorchBenchmarkLinear(TorchBenchmark):\n", |
|
"    \"\"\"Benchmark of ``torch.nn.Linear(b, c)`` applied to a ``(1, a, b)`` random input.\"\"\"\n", |
|
"\n", |
|
"    def __init__(self, a, b, c, bias=False, half=False):\n", |
|
"        \"\"\"Create the input and the layer, both moved to the GPU with ``.cuda()``.\n", |
|
"\n", |
|
"        With ``half=True`` both are converted to fp16. ``FLOP`` is set to\n", |
|
"        ``2 * a * b * c``; the optional bias is not included in that count.\n", |
|
"        \"\"\"\n", |
|
"        super().__init__()\n", |
|
"\n", |
|
"        self.half = half\n", |
|
"        sample = torch.randn((1, a, b)).cuda()\n", |
|
"        layer = torch.nn.Linear(b, c, bias=bias).cuda()\n", |
|
"        self.dummy = sample.half() if half else sample\n", |
|
"        self.op = layer.half() if half else layer\n", |
|
"        self.FLOP = 2 * a * b * c\n", |
|
"\n", |
|
"class TorchBenchmarkConv2d(TorchBenchmark):\n", |
|
"    \"\"\"Benchmark of ``torch.nn.Conv2d`` on a ``(1, in_channels, width, width)`` random input.\"\"\"\n", |
|
"\n", |
|
"    def __init__(self, width, in_channels, out_channels, kernel_size, bias=False, half=False):\n", |
|
"        \"\"\"Create the input and the convolution, both moved to the GPU with ``.cuda()``.\n", |
|
"\n", |
|
"        With ``half=True`` both are converted to fp16. ``FLOP`` is two operations\n", |
|
"        per kernel weight, per input channel, per output element; the optional\n", |
|
"        bias is not included in that count.\n", |
|
"        \"\"\"\n", |
|
"        super().__init__()\n", |
|
"        self.half = half\n", |
|
"        image = torch.randn((1, in_channels, width, width)).cuda()\n", |
|
"        conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size, bias=bias).cuda()\n", |
|
"        if half:\n", |
|
"            image, conv = image.half(), conv.half()\n", |
|
"        self.dummy, self.op = image, conv\n", |
|
"\n", |
|
"        # Output side length: the convolution is built without stride or padding arguments.\n", |
|
"        out_width = width - kernel_size + 1\n", |
|
"        per_output = kernel_size ** 2 * in_channels * 2\n", |
|
"        self.FLOP = per_output * out_channels * out_width * out_width" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 3, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"def test(device_id=0):\n", |
|
"    \"\"\"Compare fp32 and fp16 throughput of Linear and Conv2d on one CUDA device.\n", |
|
"\n", |
|
"    Prints the device properties, then for each op the progress bars, the\n", |
|
"    description of the fp32 benchmark and the fp16 / fp32 speed ratio as a\n", |
|
"    percentage.\n", |
|
"    \"\"\"\n", |
|
"    torch.cuda.set_device(device_id)\n", |
|
"    print(torch.cuda.get_device_properties(device_id))\n", |
|
"\n", |
|
"    def measure(bench_cls, *dims):\n", |
|
"        # fp32 first, then fp16; the fp32 instance is the one described.\n", |
|
"        reference = bench_cls(*dims, half=False)\n", |
|
"        fp32 = reference.test()\n", |
|
"        fp16 = bench_cls(*dims, half=True).test()\n", |
|
"        reference.describe()\n", |
|
"        return fp32, fp16\n", |
|
"\n", |
|
"    side = 2 ** 12\n", |
|
"    fp32_speed, fp16_speed = measure(TorchBenchmarkLinear, side, side, side)\n", |
|
"    print(f'Speedup ratio: {fp16_speed / fp32_speed * 100:.2f}%')\n", |
|
"\n", |
|
"    width = channels = 2 ** 8\n", |
|
"    kernel = 2 ** 3\n", |
|
"    fp32_speed, fp16_speed = measure(TorchBenchmarkConv2d, width, channels, channels, kernel)\n", |
|
"    print(f'Speedup ratio: {fp16_speed / fp32_speed * 100:.2f}%\\n\\n')" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 4, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"# Warm up: run one matmul and one convolution on every visible GPU before\n", |
|
"# anything is timed. The device count is queried instead of assuming four\n", |
|
"# GPUs, so the cell also runs on machines with fewer (or more) devices.\n", |
|
"for i in range(torch.cuda.device_count()):\n", |
|
"    torch.cuda.set_device(i)\n", |
|
"\n", |
|
"    # Results are discarded, so there is no need to track gradients.\n", |
|
"    with torch.no_grad():\n", |
|
"        A = 2 ** 12\n", |
|
"        X = torch.randn((A, A)).cuda()\n", |
|
"        torch.matmul(X, X)\n", |
|
"\n", |
|
"        A = 2 ** 8\n", |
|
"        B = 2 ** 3\n", |
|
"        X = torch.randn((1, A, A, A)).cuda()\n", |
|
"        torch.nn.Conv2d(A, A, B).cuda()(X)\n", |
|
"\n", |
|
"    # Make sure the warm-up work has finished before moving on.\n", |
|
"    torch.cuda.synchronize()" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 5, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"_CudaDeviceProperties(name='TITAN V', major=7, minor=0, total_memory=12036MB, multi_processor_count=80)\n" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:08<00:00, 12.9TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:05<00:00, 77.4TFLOP/s] \n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 4096, 4096])\n", |
|
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n", |
|
"128.000GFLOP\n", |
|
"Speedup ratio: 598.61%\n" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:12<00:00, 8.71TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:05<00:00, 76.4TFLOP/s] \n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 256, 256, 256])\n", |
|
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n", |
|
"484.383GFLOP\n", |
|
"Speedup ratio: 875.61%\n", |
|
"\n", |
|
"\n", |
|
"_CudaDeviceProperties(name='GeForce RTX 2080', major=7, minor=5, total_memory=7952MB, multi_processor_count=46)\n" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:12<00:00, 9.06TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:12<00:00, 36.2TFLOP/s] \n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 4096, 4096])\n", |
|
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n", |
|
"128.000GFLOP\n", |
|
"Speedup ratio: 398.94%\n" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:22<00:00, 4.95TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:12<00:00, 35.8TFLOP/s] \n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 256, 256, 256])\n", |
|
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n", |
|
"484.383GFLOP\n", |
|
"Speedup ratio: 721.24%\n", |
|
"\n", |
|
"\n", |
|
"_CudaDeviceProperties(name='GeForce GTX 1080 Ti', major=6, minor=1, total_memory=11178MB, multi_processor_count=28)\n" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:10<00:00, 10.5TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:42<00:00, 10.3TFLOP/s] \n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 4096, 4096])\n", |
|
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n", |
|
"128.000GFLOP\n", |
|
"Speedup ratio: 98.47%\n" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:15<00:00, 6.29TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:57<00:00, 7.14TFLOP/s] " |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 256, 256, 256])\n", |
|
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n", |
|
"484.383GFLOP\n", |
|
"Speedup ratio: 108.55%\n", |
|
"\n", |
|
"\n", |
|
"_CudaDeviceProperties(name='GeForce RTX 2080 Ti', major=7, minor=5, total_memory=10989MB, multi_processor_count=68)" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:08<00:00, 12.7TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:08<00:00, 53.4TFLOP/s] " |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 4096, 4096])\n", |
|
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n", |
|
"128.000GFLOP\n", |
|
"Speedup ratio: 419.31%\n" |
|
] |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"100%|██████████| 100T/100T [00:15<00:00, 6.20TFLOP/s] \n", |
|
"100%|██████████| 400T/400T [00:08<00:00, 51.9TFLOP/s] " |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Input shape: torch.Size([1, 256, 256, 256])\n", |
|
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n", |
|
"484.383GFLOP\n", |
|
"Speedup ratio: 750.22%\n", |
|
"\n", |
|
"\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"# Benchmark every visible CUDA device in turn rather than assuming four GPUs.\n", |
|
"for i in range(torch.cuda.device_count()):\n", |
|
"    test(i)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [] |
|
} |
|
], |
|
"metadata": { |
|
"kernelspec": { |
|
"display_name": "Python 3", |
|
"language": "python", |
|
"name": "python3" |
|
}, |
|
"language_info": { |
|
"codemirror_mode": { |
|
"name": "ipython", |
|
"version": 3 |
|
}, |
|
"file_extension": ".py", |
|
"mimetype": "text/x-python", |
|
"name": "python", |
|
"nbconvert_exporter": "python", |
|
"pygments_lexer": "ipython3", |
|
"version": "3.7.0" |
|
} |
|
}, |
|
"nbformat": 4, |
|
"nbformat_minor": 2 |
|
} |