ilia-cher

## gist:8f655cf15beb1b11547fd3564a1c3958
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision
import torchvision.transforms as T
import torchvision.datasets as datasets
import torchvision.models as models

## gist:0e78d0440fe02b77ff6721571c14f01c
import torch
from torch.profiler import profile

# Run a tiny 2x2 matmul under the profiler twice: first with the tensor moved
# to the GPU, then on the CPU. Each pass opens a fresh `profile()` context with
# default arguments; the last profiler object stays bound to `prof`.
for with_cuda in [True, False]:
    if with_cuda and not torch.cuda.is_available():
        # Without a GPU, x.cuda() raises and the CPU pass would never run;
        # skip the CUDA pass instead of aborting the whole script.
        print("CUDA is not available; skipping the CUDA run")
        continue
    with profile() as prof:
        x = torch.randn(2, 2)
        if with_cuda:
            x = x.cuda()
        x = x.matmul(x)

## gist:1fa37ad4a5149fb5eeda1245c5583c69
(pytorch) iliacher@devgpu083:~/local/pytorch  (activities_default)$ python test_resnet50.py
Files already downloaded and verified
step:0
step:1
step:2
step:3
step:4
step:5
step:6
step:7

## gist:5189904a08c4a12b24de1bf2ff9f1296
before:

~/local/pytorch  (flops_warnings)$ python benchmarks/profiler_benchmark/profiler_bench.py
Payload: loop, 256 iterations; timer min. runtime = 10

Profiling disabled, tensor size 1x1, use cuda: False, use kineto: False, with stacks: False, use script: False
<torch.utils.benchmark.utils.common.Measurement object at 0x7fcc4ca50490>
payload()
  Median: 688.50 us
  IQR:    7.70 us (684.29 to 691.99)

## gist:09daa9876082af615bb35b71683bf46e
commit edc815cb94e4a1cc501cda87c6e05a73137e4593 (HEAD -> extra_sampling_2, origin/gh/ilia-cher/89/orig)
Author: ilia-cher <iliacher@fb.com>
Date:   Wed Dec 9 14:34:44 2020 -0800


(pytorch) iliacher@devgpu083:~/local/pytorch  (extra_sampling_2)$ python
Python 3.8.5 (default, Sep  4 2020, 07:30:14)
[GCC 7.3.0] :: Anaconda, Inc. on linux
Type "help", "copyright", "credits" or "license" for more information.

## gist:b42ef2727ed85a91d4a9c6a938e49695
iliacher@devgpu083:~/fbcode  (20ae4497)$ ./buck-out/gen/caffe2/binaries/record_function_benchmark
Warm up
Tensor GEMM benchmark (1x1, 10000): 22792 us.
Tensor GEMM benchmark (16x16, 10000): 31387 us.
Pure RecordFunction benchmark (10000): 44 us.
Running without observers
Tensor GEMM benchmark (1x1, 10000): 7626 us.
Tensor GEMM benchmark (16x16, 10000): 10927 us.
Pure RecordFunction benchmark (10000): 84 us.
WARNING: Logging before InitGoogleLogging() is written to STDERR

## gist:2a33a207194810c6be73421db8e3f6c6
(pytorch) iliacher@devgpu083:~/local/pytorch  (feeee76e)$ ./build/bin/record_function_benchmark
Warmup time: 335 us.
Running without observers
Tensor GEMM benchmark (1x1, 10000): 11665 us.
Tensor GEMM benchmark (16x16, 10000): 52187 us.
Pure RecordFunction benchmark (10000): 155 us.
Running with empty observers
Tensor GEMM benchmark (1x1, 10000): 21440 us.
Tensor GEMM benchmark (16x16, 10000): 61519 us.
Pure RecordFunction benchmark (10000): 1561 us.

## gist:e988a43dc9a444ae8caa68f3e6b0a294
----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------------------------
                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  Source Location
----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------------------------
    aten::mkldnn_convolution        98.47%      30.425ms        99.07%      30.610ms      30.610ms             1  ...s/iliacher/pytorch/torch/nn/modules/conv.py(389): _conv_forward (Conv2d)
                                                                                                                  ...a/users/iliacher/pytorch/torch/nn/modules/conv.py(393): forward (Conv2d)


## gist:a5a9eb6b68504542a3cad5150fc39b1a
python benchmarks/profiler_benchmark/profiler_bench.py --with_cuda
Payload: loop, 256 iterations; timer min. runtime = 10

Profiling disabled, tensor size 1x1, use cuda: True, use kineto: False, with stacks: False, use script: False
<torch.utils.benchmark.utils.common.Measurement object at 0x7fb85eb71b80>
payload()
  Median: 3.20 ms
  IQR:    0.16 ms (3.11 to 3.27)
  3127 measurements, 1 runs per measurement, 1 thread
Profiling enabled, tensor size 1x1, use cuda: True, use kineto: False, with stacks: False, use script: False

## gist:69a01a6ef294511a797e8cf4a1e6db75
USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 USE_CUDA=0 python setup.py develop install --cmake

base:
python benchmarks/profiler_benchmark/profiler_bench.py --use_timer

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f082441f0d0>
payload()
	import torch
	import torch.nn as nn
	import torch.nn.parallel
	import torch.backends.cudnn as cudnn
	import torch.optim
	import torch.utils.data
	import torchvision
	import torchvision.transforms as T
	import torchvision.datasets as datasets
	import torchvision.models as models
	import torch
	from torch.profiler import *

	for with_cuda in [True, False]:
	with profile() as prof:
	x = torch.randn(2, 2)
	if with_cuda:
	x = x.cuda()
	x = x.matmul(x)
	(pytorch) iliacher@devgpu083:~/local/pytorch (activities_default)$ python test_resnet50.py
	Files already downloaded and verified
	step:0
	step:1
	step:2
	step:3
	step:4
	step:5
	step:6
	step:7
	before:

	~/local/pytorch (flops_warnings)$ python benchmarks/profiler_benchmark/profiler_bench.py
	Payload: loop, 256 iterations; timer min. runtime = 10

	Profiling disabled, tensor size 1x1, use cuda: False, use kineto: False, with stacks: False, use script: False
	<torch.utils.benchmark.utils.common.Measurement object at 0x7fcc4ca50490>
	payload()
	Median: 688.50 us
	IQR: 7.70 us (684.29 to 691.99)
	commit edc815cb94e4a1cc501cda87c6e05a73137e4593 (HEAD -> extra_sampling_2, origin/gh/ilia-cher/89/orig)
	Author: ilia-cher <iliacher@fb.com>
	Date: Wed Dec 9 14:34:44 2020 -0800



	(pytorch) iliacher@devgpu083:~/local/pytorch (extra_sampling_2)$ python
	Python 3.8.5 (default, Sep 4 2020, 07:30:14)
	[GCC 7.3.0] :: Anaconda, Inc. on linux
	Type "help", "copyright", "credits" or "license" for more information.
	iliacher@devgpu083:~/fbcode (20ae4497)$ ./buck-out/gen/caffe2/binaries/record_function_benchmark
	Warm up
	Tensor GEMM benchmark (1x1, 10000): 22792 us.
	Tensor GEMM benchmark (16x16, 10000): 31387 us.
	Pure RecordFunction benchmark (10000): 44 us.
	Running without observers
	Tensor GEMM benchmark (1x1, 10000): 7626 us.
	Tensor GEMM benchmark (16x16, 10000): 10927 us.
	Pure RecordFunction benchmark (10000): 84 us.
	WARNING: Logging before InitGoogleLogging() is written to STDERR
	(pytorch) iliacher@devgpu083:~/local/pytorch (feeee76e)$ ./build/bin/record_function_benchmark
	Warmup time: 335 us.
	Running without observers
	Tensor GEMM benchmark (1x1, 10000): 11665 us.
	Tensor GEMM benchmark (16x16, 10000): 52187 us.
	Pure RecordFunction benchmark (10000): 155 us.
	Running with empty observers
	Tensor GEMM benchmark (1x1, 10000): 21440 us.
	Tensor GEMM benchmark (16x16, 10000): 61519 us.
	Pure RecordFunction benchmark (10000): 1561 us.
	---------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ---------------------------------------------------------------------------
	Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Source Location
	---------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ---------------------------------------------------------------------------
	aten::mkldnn_convolution 98.47% 30.425ms 99.07% 30.610ms 30.610ms 1 ...s/iliacher/pytorch/torch/nn/modules/conv.py(389): _conv_forward (Conv2d)
	...a/users/iliacher/pytorch/torch/nn/modules/conv.py(393): forward (Conv2d)
	python benchmarks/profiler_benchmark/profiler_bench.py --with_cuda
	Payload: loop, 256 iterations; timer min. runtime = 10

	Profiling disabled, tensor size 1x1, use cuda: True, use kineto: False, with stacks: False, use script: False
	<torch.utils.benchmark.utils.common.Measurement object at 0x7fb85eb71b80>
	payload()
	Median: 3.20 ms
	IQR: 0.16 ms (3.11 to 3.27)
	3127 measurements, 1 runs per measurement, 1 thread
	Profiling enabled, tensor size 1x1, use cuda: True, use kineto: False, with stacks: False, use script: False
	USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 USE_CUDA=0 python setup.py develop install --cmake

	base:
	python benchmarks/profiler_benchmark/profiler_bench.py --use_timer

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f082441f0d0>
	payload()