ilia-cher/gist:69a01a6ef294511a797e8cf4a1e6db75

## gistfile1.txt
USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 USE_CUDA=0 python setup.py develop install --cmake

Base (baseline build, without the PR's changes):
python benchmarks/profiler_benchmark/profiler_bench.py --use_timer

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f082441f0d0>
payload()
  Median: 874.28 us
  IQR:    21.81 us (863.57 to 885.39)
  115 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7fea1fa030d0>
payload()
  Median: 853.30 us
  IQR:    23.12 us (839.39 to 862.51)
  118 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7fa910ece0d0>
payload()
  Median: 944.96 us
  IQR:    21.73 us (932.26 to 953.98)
  106 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f985d0350d0>
payload()
  Median: 824.83 us
  IQR:    22.66 us (810.02 to 832.68)
  122 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7fec482f90d0>
payload()
  Median: 882.59 us
  IQR:    23.45 us (872.44 to 895.89)
  114 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f06f98030d0>
payload()
  Median: 965.52 us
  IQR:    7.39 us (960.24 to 967.63)
  104 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f6ffaf100d0>
payload()
  Median: 935.27 us
  IQR:    7.91 us (929.42 to 937.32)
  108 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f4fc56c60d0>
payload()
  Median: 852.49 us
  IQR:    16.90 us (844.33 to 861.22)
  118 measurements, 1000 runs per measurement, 1 thread


PR (build with the PR's changes applied):
python benchmarks/profiler_benchmark/profiler_bench.py --use_timer

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f0621be3310>
payload()
  Median: 846.23 us
  IQR:    18.41 us (838.14 to 856.55)
  119 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f1dada4b310>
payload()
  Median: 866.56 us
  IQR:    19.08 us (857.38 to 876.46)
  116 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f05c84b0310>
payload()
  Median: 881.75 us
  IQR:    19.57 us (871.79 to 891.37)
  114 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f5b36b45310>
payload()
  Median: 931.29 us
  IQR:    22.71 us (921.67 to 944.38)
  108 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7ff6652ce310>
payload()
  Median: 885.31 us
  IQR:    16.96 us (875.23 to 892.20)
  114 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7ffa8819e310>
payload()
  Median: 853.89 us
  IQR:    21.47 us (843.78 to 865.25)
  118 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7fd80c581310>
payload()
  Median: 891.49 us
  IQR:    21.51 us (881.16 to 902.67)
  113 measurements, 1000 runs per measurement, 1 thread

Payload: loop; 256 iterations, N = 100

Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
<torch.utils._benchmark.utils.common.Measurement object at 0x7f3a33525310>
payload()
  Median: 876.65 us
  IQR:    19.72 us (865.53 to 885.26)
  115 measurements, 1000 runs per measurement, 1 thread
	USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 USE_CUDA=0 python setup.py develop install --cmake

	Base (baseline build, without the PR's changes):
	python benchmarks/profiler_benchmark/profiler_bench.py --use_timer

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f082441f0d0>
	payload()
	Median: 874.28 us
	IQR: 21.81 us (863.57 to 885.39)
	115 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7fea1fa030d0>
	payload()
	Median: 853.30 us
	IQR: 23.12 us (839.39 to 862.51)
	118 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7fa910ece0d0>
	payload()
	Median: 944.96 us
	IQR: 21.73 us (932.26 to 953.98)
	106 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f985d0350d0>
	payload()
	Median: 824.83 us
	IQR: 22.66 us (810.02 to 832.68)
	122 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7fec482f90d0>
	payload()
	Median: 882.59 us
	IQR: 23.45 us (872.44 to 895.89)
	114 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f06f98030d0>
	payload()
	Median: 965.52 us
	IQR: 7.39 us (960.24 to 967.63)
	104 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f6ffaf100d0>
	payload()
	Median: 935.27 us
	IQR: 7.91 us (929.42 to 937.32)
	108 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f4fc56c60d0>
	payload()
	Median: 852.49 us
	IQR: 16.90 us (844.33 to 861.22)
	118 measurements, 1000 runs per measurement, 1 thread





	PR (build with the PR's changes applied):
	python benchmarks/profiler_benchmark/profiler_bench.py --use_timer

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f0621be3310>
	payload()
	Median: 846.23 us
	IQR: 18.41 us (838.14 to 856.55)
	119 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f1dada4b310>
	payload()
	Median: 866.56 us
	IQR: 19.08 us (857.38 to 876.46)
	116 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f05c84b0310>
	payload()
	Median: 881.75 us
	IQR: 19.57 us (871.79 to 891.37)
	114 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f5b36b45310>
	payload()
	Median: 931.29 us
	IQR: 22.71 us (921.67 to 944.38)
	108 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7ff6652ce310>
	payload()
	Median: 885.31 us
	IQR: 16.96 us (875.23 to 892.20)
	114 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7ffa8819e310>
	payload()
	Median: 853.89 us
	IQR: 21.47 us (843.78 to 865.25)
	118 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7fd80c581310>
	payload()
	Median: 891.49 us
	IQR: 21.51 us (881.16 to 902.67)
	113 measurements, 1000 runs per measurement, 1 thread

	Payload: loop; 256 iterations, N = 100

	Profiling disabled, tensor size 1x1, use cuda: False, with stacks: False, use script: False
	<torch.utils._benchmark.utils.common.Measurement object at 0x7f3a33525310>
	payload()
	Median: 876.65 us
	IQR: 19.72 us (865.53 to 885.26)
	115 measurements, 1000 runs per measurement, 1 thread