@vkuzo
Created January 26, 2021 04:59
# after
(pytorch2) [vasiliy@devgpu108.ash6 ~/local/pytorch/benchmarks/operator_benchmark] python -m pt.quantization_test
# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C3_H512_W512_cpu
# Input: N: 1, C: 3, H: 512, W: 512, device: cpu
Forward Execution Time (us) : 934.408
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C3_H512_W512_cuda
# Input: N: 1, C: 3, H: 512, W: 512, device: cuda
Forward Execution Time (us) : 780.428
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C32_H1024_W1024_cpu
# Input: N: 1, C: 32, H: 1024, W: 1024, device: cpu
Forward Execution Time (us) : 36511.692
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C32_H1024_W1024_cuda
# Input: N: 1, C: 32, H: 1024, W: 1024, device: cuda
Forward Execution Time (us) : 1469.467
(pytorch2) [vasiliy@devgpu108.ash6 ~/local/pytorch/benchmarks/operator_benchmark]
# before
(pytorch2) [vasiliy@devgpu108.ash6 ~/local/pytorch/benchmarks/operator_benchmark] python -m pt.quantization_test
# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C3_H512_W512_cpu
# Input: N: 1, C: 3, H: 512, W: 512, device: cpu
Forward Execution Time (us) : 960.386
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C3_H512_W512_cuda
# Input: N: 1, C: 3, H: 512, W: 512, device: cuda
Forward Execution Time (us) : 727.115
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C32_H1024_W1024_cpu
# Input: N: 1, C: 32, H: 1024, W: 1024, device: cpu
Forward Execution Time (us) : 29881.734
# Benchmarking PyTorch: FakeQuantize
# Mode: Eager
# Name: FakeQuantize_N1_C32_H1024_W1024_cuda
# Input: N: 1, C: 32, H: 1024, W: 1024, device: cuda
Forward Execution Time (us) : 1526.125
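For context on what is being benchmarked: FakeQuantize simulates quantization numerics in floating point so a model can be trained with quantization error in the loop. A minimal pure-Python sketch of the per-tensor affine quantize-dequantize round trip it performs (illustrative only; the actual PyTorch operator is vectorized and also tracks observer statistics):

```python
def fake_quantize(x, scale, zero_point, qmin=0, qmax=255):
    """Simulate quantization of a float value: quantize to an integer
    grid, clamp to the representable range, then dequantize back."""
    # Quantize: map to the integer grid and clamp to [qmin, qmax]
    q = min(max(round(x / scale) + zero_point, qmin), qmax)
    # Dequantize: map back to float, carrying the rounding/clamping error
    return (q - zero_point) * scale

# Values on the grid round-trip exactly; out-of-range values are clamped
print(fake_quantize(0.5, 0.01, 0))   # representable: survives unchanged
print(fake_quantize(10.0, 0.01, 0))  # clamped to qmax * scale
```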