Skip to content

Instantly share code, notes, and snippets.

@ypwhs
Last active November 24, 2018 18:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ypwhs/a156353cb19e780e323be054821af015 to your computer and use it in GitHub Desktop.
Save ypwhs/a156353cb19e780e323be054821af015 to your computer and use it in GitHub Desktop.

PyTorch Benchmark

Environment

  • torch version: 1.0.0a0+ff608a9
  • CUDA version: 10.0.130
  • DISTRIB_DESCRIPTION="Ubuntu 16.04 LTS"

Layer config

Linear

  • Input shape: torch.Size([1, 4096, 4096])
  • Op: Linear(in_features=4096, out_features=4096, bias=False)
  • 128.000GFLOP

Conv2d

  • Input shape: torch.Size([1, 256, 256, 256])
  • Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)
  • 484.383GFLOP

Result

GPU Model FP32 Linear FP16 Linear Linear Speedup FP32 Conv2d FP16 Conv2d Conv2d Speedup
TITAN V 12.9TFLOPS 77.4TFLOPS 598.61% 8.71TFLOPS 76.4TFLOPS 875.61%
RTX 2080 Ti 12.7TFLOPS 53.4TFLOPS 419.31% 6.20TFLOPS 51.9TFLOPS 750.22%
RTX 2080 9.06TFLOPS 36.2TFLOPS 398.94% 4.95TFLOPS 35.8TFLOPS 721.24%
GTX 1080 Ti 10.5TFLOPS 10.3TFLOPS 98.47% 6.29TFLOPS 7.14TFLOPS 108.55%
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch version: 1.0.0a0+ff608a9\n",
"CUDA version: 10.0.130\n",
"DISTRIB_DESCRIPTION=\"Ubuntu 16.04 LTS\"\r\n"
]
}
],
"source": [
"import sys\n",
"import torch\n",
"from tqdm import tqdm\n",
"\n",
"print('torch version:', torch.__version__)\n",
"print('CUDA version:', (torch.version.cuda))\n",
"!tail -n 1 /etc/lsb-release"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class TorchBenchmark:\n",
" def f(self):\n",
" with torch.no_grad():\n",
" while True:\n",
" self.op(self.dummy)\n",
" torch.cuda.synchronize()\n",
" yield self.FLOP\n",
" \n",
" def test(self, tflop=100):\n",
" if self.half:\n",
" tflop *= 4\n",
" sys.stdout.flush()\n",
" total = tflop * 1024 ** 4\n",
" with tqdm(self.f(), total=total, unit='FLOP', unit_scale=True, unit_divisor=1024) as pbar:\n",
" for x in pbar:\n",
" if pbar.n + x > total:\n",
" pbar.update(total - pbar.n)\n",
" break\n",
" else:\n",
" pbar.update(x)\n",
" \n",
" mean_speed = pbar.last_print_n / (pbar.last_print_t - pbar.start_t) / (1024 ** 4)\n",
" return mean_speed\n",
" \n",
" def describe(self):\n",
" print('Input shape:',self.dummy.shape)\n",
" print('Op:', self.op)\n",
" print(f'{self.FLOP / (1024 ** 3):.3f}GFLOP')\n",
"\n",
"class TorchBenchmarkLinear(TorchBenchmark):\n",
" def __init__(self, a, b, c, bias=False, half=False):\n",
" super(TorchBenchmarkLinear, self).__init__()\n",
" \n",
" self.half = half\n",
" self.dummy = torch.randn((1, a, b)).cuda()\n",
" self.op = torch.nn.Linear(b, c, bias=bias).cuda()\n",
" if half:\n",
" self.dummy = self.dummy.half()\n",
" self.op = self.op.half()\n",
" self.FLOP = a * b * c * 2\n",
"\n",
"class TorchBenchmarkConv2d(TorchBenchmark):\n",
" def __init__(self, width, in_channels, out_channels, kernel_size, bias=False, half=False):\n",
" super(TorchBenchmarkConv2d, self).__init__()\n",
" self.half = half\n",
" self.dummy = torch.randn((1, in_channels, width, width)).cuda()\n",
" self.op = torch.nn.Conv2d(in_channels, out_channels, kernel_size, bias=bias).cuda()\n",
" if half:\n",
" self.dummy = self.dummy.half()\n",
" self.op = self.op.half()\n",
" \n",
" width2 = width - kernel_size + 1\n",
" self.FLOP = kernel_size ** 2 * in_channels * 2 * out_channels * width2 * width2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def test(device_id=0):\n",
" torch.cuda.set_device(device_id)\n",
" print(torch.cuda.get_device_properties(device_id))\n",
"\n",
" A = 2**12\n",
" bm = TorchBenchmarkLinear(A, A, A)\n",
" fp32_speed = bm.test()\n",
" fp16_speed = TorchBenchmarkLinear(A, A, A, half=True).test()\n",
" bm.describe()\n",
" print(f'Speedup ratio: {fp16_speed / fp32_speed * 100:.2f}%')\n",
" \n",
" W = 2 ** 8\n",
" C = 2 ** 8\n",
" K = 2 ** 3\n",
" bm = TorchBenchmarkConv2d(W, C, C, K, half=False)\n",
" fp32_speed = bm.test()\n",
" fp16_speed = TorchBenchmarkConv2d(W, C, C, K, half=True).test()\n",
" bm.describe()\n",
" print(f'Speedup ratio: {fp16_speed / fp32_speed * 100:.2f}%\\n\\n')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# warm up\n",
"for i in range(4):\n",
" torch.cuda.set_device(i)\n",
" \n",
" A = 2 ** 12\n",
" X = torch.randn((A, A)).cuda()\n",
" torch.matmul(X, X)\n",
" \n",
" A = 2 ** 8\n",
" B = 2 ** 3\n",
" X = torch.randn((1, A, A, A)).cuda()\n",
" torch.nn.Conv2d(A, A, B).cuda()(X)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_CudaDeviceProperties(name='TITAN V', major=7, minor=0, total_memory=12036MB, multi_processor_count=80)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:08<00:00, 12.9TFLOP/s] \n",
"100%|██████████| 400T/400T [00:05<00:00, 77.4TFLOP/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 4096, 4096])\n",
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n",
"128.000GFLOP\n",
"Speedup ratio: 598.61%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:12<00:00, 8.71TFLOP/s] \n",
"100%|██████████| 400T/400T [00:05<00:00, 76.4TFLOP/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 256, 256, 256])\n",
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n",
"484.383GFLOP\n",
"Speedup ratio: 875.61%\n",
"\n",
"\n",
"_CudaDeviceProperties(name='GeForce RTX 2080', major=7, minor=5, total_memory=7952MB, multi_processor_count=46)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:12<00:00, 9.06TFLOP/s] \n",
"100%|██████████| 400T/400T [00:12<00:00, 36.2TFLOP/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 4096, 4096])\n",
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n",
"128.000GFLOP\n",
"Speedup ratio: 398.94%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:22<00:00, 4.95TFLOP/s] \n",
"100%|██████████| 400T/400T [00:12<00:00, 35.8TFLOP/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 256, 256, 256])\n",
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n",
"484.383GFLOP\n",
"Speedup ratio: 721.24%\n",
"\n",
"\n",
"_CudaDeviceProperties(name='GeForce GTX 1080 Ti', major=6, minor=1, total_memory=11178MB, multi_processor_count=28)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:10<00:00, 10.5TFLOP/s] \n",
"100%|██████████| 400T/400T [00:42<00:00, 10.3TFLOP/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 4096, 4096])\n",
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n",
"128.000GFLOP\n",
"Speedup ratio: 98.47%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:15<00:00, 6.29TFLOP/s] \n",
"100%|██████████| 400T/400T [00:57<00:00, 7.14TFLOP/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 256, 256, 256])\n",
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n",
"484.383GFLOP\n",
"Speedup ratio: 108.55%\n",
"\n",
"\n",
"_CudaDeviceProperties(name='GeForce RTX 2080 Ti', major=7, minor=5, total_memory=10989MB, multi_processor_count=68)"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:08<00:00, 12.7TFLOP/s] \n",
"100%|██████████| 400T/400T [00:08<00:00, 53.4TFLOP/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 4096, 4096])\n",
"Op: Linear(in_features=4096, out_features=4096, bias=False)\n",
"128.000GFLOP\n",
"Speedup ratio: 419.31%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100T/100T [00:15<00:00, 6.20TFLOP/s] \n",
"100%|██████████| 400T/400T [00:08<00:00, 51.9TFLOP/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input shape: torch.Size([1, 256, 256, 256])\n",
"Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)\n",
"484.383GFLOP\n",
"Speedup ratio: 750.22%\n",
"\n",
"\n"
]
}
],
"source": [
"for i in range(4):\n",
" test(i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment