Last active
September 30, 2020 12:30
-
-
Save dcslin/1ffeed1319c60381673e904848ce1e47 to your computer and use it in GitHub Desktop.
benchmark pytorch resnet18 cifar10 apex amp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Diff https://github.com/kuangliu/pytorch-cifar/blob/master/main.py
'''
# Mixed-precision setup via NVIDIA Apex AMP.
from apex import amp

# Wrap model + optimizer; opt_level selects the AMP mode
# (e.g. "O0" pure fp32, "O1" mixed, "O3" pure fp16).
# NOTE(review): `net`, `optimizer`, and `args` come from the
# surrounding pytorch-cifar main.py — not shown in this diff.
net, optimizer = amp.initialize(net, optimizer, opt_level=args.opt_level)

# if device == 'cuda':
#     net = torch.nn.DataParallel(net)
#     cudnn.benchmark = True

# loss.backward()  # original call, replaced by the scaled version below
# Scale the loss before backward so fp16 gradients do not underflow.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
O0 pure fp32 nvprof
Time(%) Time Calls Avg Min Max Name
10.12% 2.49372s 5865 425.19us 7.7450us 595.27us void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 const *)
9.75% 2.40237s 3128 768.02us 241.49us 962.20us volta_cgemm_32x32_tn
8.06% 1.98524s 391 5.0773ms 3.3001ms 6.7060ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
6.47% 1.59362s 7429 214.51us 123.85us 309.55us volta_sgemm_128x64_nt
6.24% 1.53810s 6874 223.76us 109.35us 307.67us volta_sgemm_128x64_nn
6.10% 1.50371s 3910 384.58us 24.898us 440.95us void fft2d_r2c_64x64<float>(float2*, float const *, int, int, int, int, int, int, int, int)
5.97% 1.47099s 11957 123.02us 7.2010us 239.73us void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
5.51% 1.35837s 11957 113.60us 32.353us 226.45us void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
4.76% 1.17378s 103900 11.297us 928ns 185.90us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
4.49% 1.10594s 14994 73.759us 3.2960us 186.00us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
4.28% 1.05338s 5865 179.60us 40.451us 310.19us void cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>(float, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnnTensorStruct, float const *, float, float const , float, cudnnTensorStruct*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>)
3.55% 873.68ms 3128 279.31us 41.602us 519.65us void flip_filter<float, float>(float*, float const *, int, int, int, int)
2.94% 724.88ms 2736 264.94us 162.83us 385.21us void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
2.63% 648.77ms 5860 110.71us 42.306us 196.40us void cudnn::detail::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=1, int=1>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=1, int=1>, cudnnTensorStruct*, float const *, float const , cudnnTensorStruct*, cudnnTensorStruct*, cudnnTensorStruct**, float const *, float const *, float const *, cudnnTensorStruct*, cudnnTensorStruct*)
2.15% 529.52ms 390 1.3577ms 1.3109ms 1.7981ms volta_gcgemm_32x32_nt
1.59% 392.37ms 11957 32.815us 2.7520us 124.01us void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
1.46% 359.73ms 391 920.03us 581.03us 1.2001ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
1.40% 344.74ms 782 440.85us 272.18us 633.86us volta_scudnn_128x128_stridedB_splitK_small_nn_v1
1.38% 340.18ms 982 346.42us 258.13us 463.96us volta_scudnn_128x128_relu_small_nn_v1
1.34% 329.08ms 1563 210.55us 192.17us 262.54us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
1.19% 294.09ms 782 376.08us 102.28us 798.29us void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int)
1.00% 245.49ms 1955 125.57us 13.665us 174.57us void fft2d_c2r_64x64<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*)
0.82% 201.46ms 2346 85.872us 40.515us 165.32us void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)
0.80% 196.69ms 491 400.59us 379.83us 528.35us volta_scudnn_128x64_relu_small_nn_v1
0.67% 165.03ms 782 211.04us 112.17us 349.04us volta_scudnn_128x128_stridedB_splitK_interior_nn_v1
0.66% 163.46ms 390 419.13us 409.72us 461.72us void fft2d_r2c_32x32<float, bool=1, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.56% 137.67ms 1110 124.03us 992ns 5.0508ms [CUDA memcpy HtoD]
0.48% 117.92ms 2346 50.265us 25.889us 67.044us void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
0.46% 112.27ms 2346 47.854us 19.105us 69.700us void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)
0.43% 105.99ms 2000 52.994us 24.802us 99.589us void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>(float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>, cudnnTensorStruct, float const *, float, cudnnTensorStruct*, float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const *, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>)
0.40% 97.487ms 24180 4.0310us 1.0560us 36.738us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.36% 89.534ms 783 114.35us 65.220us 395.77us void cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
0.31% 75.861ms 3520 21.551us 1.3440us 62.308us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.26% 63.671ms 24962 2.5500us 768ns 17.857us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
0.24% 57.943ms 1955 29.638us 17.633us 33.506us void cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(float, float, float, float, cudnnTensorStruct, float const *, cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, float const , cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)
0.23% 56.392ms 982 57.425us 44.866us 75.812us volta_scudnn_128x64_relu_interior_nn_v1
0.21% 50.851ms 391 130.05us 103.59us 153.00us void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
0.18% 43.979ms 1960 22.438us 14.017us 26.562us void cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)
0.14% 35.562ms 491 72.427us 55.299us 86.149us volta_scudnn_128x128_relu_interior_nn_v1
0.06% 14.862ms 7820 1.9000us 1.4720us 9.4400us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE4_clEvEUlllE_EEvS4_RKT_EUllE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.06% 13.896ms 391 35.540us 23.233us 45.667us void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<float, float>*, int, bool, bool)
0.04% 10.541ms 1104 9.5470us 1.2160us 2.0619ms [CUDA memcpy DtoH]
0.04% 9.0982ms 3128 2.9080us 1.6000us 7.8080us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.02% 6.1390ms 2745 2.2360us 960ns 7.6490us [CUDA memset]
0.02% 6.1029ms 491 12.429us 11.648us 16.577us void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<float, float>*, int, bool, bool)
0.02% 5.6197ms 2946 1.9070us 1.5040us 8.8970us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.02% 4.9938ms 1564 3.1920us 1.2800us 7.0080us cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.02% 3.8378ms 491 7.8160us 7.4240us 9.6320us volta_sgemm_32x32_sliced1x4_tn
0.01% 3.3765ms 391 8.6350us 7.0090us 11.232us _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
0.01% 3.0405ms 391 7.7760us 7.4560us 10.240us volta_sgemm_32x32_sliced1x4_nt
0.01% 2.8789ms 1564 1.8400us 1.3760us 5.5040us cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.01% 2.6288ms 491 5.3530us 4.5760us 6.7520us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::MaxOps<float>, unsigned int, float, int=4>>(float)
0.01% 2.1970ms 491 4.4740us 4.1920us 5.4410us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
0.01% 2.1036ms 491 4.2840us 4.0000us 6.6890us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.01% 2.0338ms 491 4.1420us 3.5520us 11.201us void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
0.01% 1.9302ms 391 4.9360us 4.7360us 6.1130us volta_sgemm_128x32_nn
0.01% 1.7138ms 491 3.4900us 2.9440us 4.2560us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.01% 1.4417ms 491 2.9360us 2.4960us 5.8880us void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
0.00% 1.1803ms 391 3.0180us 2.4000us 7.3930us void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
0.00% 1.0792ms 491 2.1980us 1.7280us 5.3440us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
0.00% 859.31us 491 1.7500us 1.2800us 3.2960us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.00% 846.99us 391 2.1660us 1.4400us 5.2480us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
0.00% 218.67us 62 3.5260us 1.1840us 36.354us [CUDA memcpy DtoD]
shape of 1 batch
inputs shape torch.Size([128, 3, 32, 32])
targets shape torch.Size([128])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
kernel meanings
https://www.adityaagrawal.net/blog/dnn/resnet50
O3 pure fp16 nvprof