Last active
September 30, 2020 12:30
-
-
Save dcslin/1ffeed1319c60381673e904848ce1e47 to your computer and use it in GitHub Desktop.
benchmark pytorch resnet18 cifar10 apex amp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Diff https://github.com/kuangliu/pytorch-cifar/blob/master/main.py
'''
from apex import amp
net, optimizer = amp.initialize(net, optimizer, opt_level=args.opt_level)
#if device == 'cuda':
#    net = torch.nn.DataParallel(net)
#    cudnn.benchmark = True
# loss.backward()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
kernel meanings
https://www.adityaagrawal.net/blog/dnn/resnet50
O3 pure fp16 nvprof
Time(%) Time Calls Avg Min Max Name
17.94% 2.20808s 7038 313.74us 29.378us 1.0335ms turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
9.65% 1.18803s 3516 337.89us 260.88us 472.99us volta_fp16_s884cudnn_fp16_128x128_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
8.96% 1.10349s 43864 25.157us 2.1120us 67.908us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
6.73% 829.13ms 7820 106.03us 22.433us 302.51us void at::native::batch_norm_backward_kernel<c10::Half, c10::Half, float, int>(at::GenericPackedTensorAccessor<c10::Half, unsigned long=3, at::DefaultPtrTraits, int>, int, int, at::native::batch_norm_backward_kernel<c10::Half<c10::Half, unsigned long=1, c10::Half, at::DefaultPtrTraits>, c10::Half, float, int>, c10::Half, c10::Half, c10::Half, c10::Half, at::native::batch_norm_backward_kernel<c10::Half<float, unsigned long=1, c10::Half, at::DefaultPtrTraits>, c10::Half, float, int>, float, bool, at::native::batch_norm_backward_kernel<c10::Half<c10::Half, unsigned long=1, c10::Half, at::DefaultPtrTraits>, c10::Half, float, int>)
6.48% 797.55ms 3430 232.52us 88.901us 444.86us turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc_tn_v1
6.31% 776.60ms 1964 395.42us 297.20us 540.77us volta_gcgemm_32x32_nt
5.05% 621.51ms 103900 5.9810us 928ns 94.533us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
4.88% 601.38ms 1564 384.51us 243.73us 513.69us volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
4.49% 553.35ms 14994 36.904us 2.6240us 94.853us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIN3c104HalfEEEvRNS_14TensorIteratorET_S7_EUlS4_S4_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
3.64% 448.65ms 1964 228.44us 151.08us 308.34us turing_fp16_s1688cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1
3.14% 386.23ms 7820 49.389us 13.409us 144.39us void at::native::batch_norm_collect_statistics_kernel<at::native::InvStd, c10::Half, c10::Half, float, int>(at::GenericPackedTensorAccessor<c10::Half, unsigned long=3, at::RestrictPtrTraits, int>, float, at::GenericPackedTensorAccessor<c10::Half, unsigned long=3, at::RestrictPtrTraits, int>, at::native::batch_norm_collect_statistics_kernel<at::native::InvStd<c10::Half, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, c10::Half, float, int>, c10::Half, at::native::batch_norm_collect_statistics_kernel<at::native::InvStd<at::GenericPackedTensorAccessor<c10::Half, unsigned long=3, at::RestrictPtrTraits, int>, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, c10::Half, float, int>, at::native::batch_norm_collect_statistics_kernel<at::native::InvStd<c10::Half, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, c10::Half, float, int>)
3.07% 378.41ms 1964 192.67us 111.59us 237.71us void fft2d_c2r_32x32<__half, bool=1, bool=0, unsigned int=0, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
2.57% 316.63ms 10693 29.611us 7.2960us 65.636us void nhwcToNchwKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
2.39% 294.13ms 1971 149.23us 87.653us 232.97us turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_small_nhwc_tn_v1
2.30% 282.62ms 7820 36.140us 13.184us 80.965us void at::native::batch_norm_transform_input_kernel<c10::Half, c10::Half, float, bool=1, int>(at::GenericPackedTensorAccessor<c10::Half, unsigned long=3, at::RestrictPtrTraits, int>, int, at::native::batch_norm_transform_input_kernel<c10::Half<std::conditional<bool=1, float, c10::Half>::type, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=1, int>, std::conditional<bool=1, float, c10::Half>::type, at::native::batch_norm_transform_input_kernel<c10::Half<float, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=1, int>, at::native::batch_norm_transform_input_kernel<c10::Half<std::conditional<bool=1, float, c10::Half>::type, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=1, int>, std::conditional)
2.10% 259.04ms 391 662.51us 463.16us 890.83us void cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>(int, int, int, __half const *, int, cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>*, __half const , kernel_grad_params, int, float, int, int, int, int)
2.09% 257.44ms 1964 131.08us 71.780us 204.27us void fft2d_r2c_32x32<__half, bool=0, unsigned int=0, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1.67% 205.77ms 1173 175.42us 112.17us 241.20us void dgrad_1d<int=9>(unsigned short*, unsigned short*, unsigned short*, int, int, int, int, int, cudnn::reduced_divisor, cudnn::reduced_divisor, int, int, int)
1.09% 133.68ms 1110 120.43us 992ns 5.2825ms [CUDA memcpy HtoD]
0.73% 90.245ms 7038 12.822us 3.6480us 50.851us void nhwcToNchwKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, float const *, __half*, float, float)
0.51% 62.685ms 24180 2.5920us 1.0240us 18.625us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfES8_E_EEvS4_RKT_EUlS8_E0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.51% 62.681ms 2000 31.340us 17.633us 56.163us void at::native::batch_norm_transform_input_kernel<c10::Half, c10::Half, float, bool=0, int>(at::GenericPackedTensorAccessor<c10::Half, unsigned long=3, at::RestrictPtrTraits, int>, int, at::native::batch_norm_transform_input_kernel<c10::Half<std::conditional<bool=0, float, c10::Half>::type, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=0, int>, std::conditional<bool=0, float, c10::Half>::type, at::native::batch_norm_transform_input_kernel<c10::Half<float, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=0, int>, at::native::batch_norm_transform_input_kernel<c10::Half<std::conditional<bool=0, float, c10::Half>::type, unsigned long=1, c10::Half, at::RestrictPtrTraits>, c10::Half, float, bool=0, int>, std::conditional)
0.40% 49.394ms 491 100.60us 93.573us 123.62us void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=1>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.40% 48.773ms 1173 41.579us 29.153us 65.955us dgrad_1x1_stride_2x2
0.37% 45.115ms 9392 4.8030us 1.0240us 32.994us [CUDA memset]
0.34% 41.259ms 24571 1.6790us 736ns 9.6970us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE9_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
0.27% 33.794ms 7038 4.8010us 1.0880us 17.601us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.25% 30.186ms 490 61.603us 47.523us 77.509us volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1
0.24% 29.978ms 491 61.055us 42.978us 88.773us volta_fp16_scudnn_fp16_128x64_relu_small_nn_v1
0.20% 24.329ms 14412 1.6880us 1.1200us 6.3040us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.17% 21.177ms 391 54.161us 49.218us 71.524us void cask_cudnn::first_layer_wgrad_kernel<int=3, int=3, int=1, int=1, int=64>(FirstLayerWgradParams)
0.14% 17.611ms 981 17.952us 16.289us 24.226us turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
0.11% 14.097ms 7820 1.8020us 1.2160us 8.8010us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE4_clEvEUlllE_EEvS4_RKT_EUllE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.11% 13.517ms 7038 1.9200us 1.5040us 6.4960us cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.10% 12.871ms 391 32.917us 21.697us 44.962us void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<c10::Half, float>(int, c10::Half const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<c10::Half, float>*, int, bool, bool)
0.09% 10.655ms 1104 9.6510us 1.2160us 2.3272ms [CUDA memcpy DtoH]
0.07% 8.2833ms 391 21.185us 14.337us 27.842us void nchwToNhwc3To4Kernel<__half, __half, float, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.06% 6.9803ms 5083 1.3730us 1.0240us 8.4810us cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.05% 6.7086ms 984 6.8170us 1.6320us 32.354us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESE_NS0_6memory12LoadWithCastILi1EEENSF_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.03% 3.8133ms 491 7.7660us 7.4250us 10.369us void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<c10::Half, float>(int, c10::Half const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<c10::Half, float>*, int, bool, bool)
0.03% 3.5869ms 491 7.3050us 7.1680us 9.4410us volta_sgemm_fp16_32x32_sliced1x4_tn
0.03% 3.4619ms 391 8.8530us 6.8490us 11.873us _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
0.03% 3.1815ms 391 8.1360us 7.6160us 10.529us volta_fp16_sgemm_fp16_32x32_sliced1x4_nt
0.02% 2.8242ms 593 4.7620us 1.1840us 29.634us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.02% 2.5423ms 491 5.1770us 4.5120us 6.6560us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::MaxOps<float>, unsigned int, float, int=4>>(float)
0.02% 2.2364ms 491 4.5540us 4.0000us 5.8240us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESE_NS0_6memory15LoadWithoutCastENSF_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.02% 2.1866ms 491 4.4530us 4.1600us 5.7600us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
0.02% 1.9457ms 391 4.9760us 4.8640us 6.7210us generic4To3Channel_kernel(cudnnTensorStruct, __half const *, cudnnTensorStruct, __half*)
0.02% 1.9371ms 491 3.9450us 3.4880us 7.2650us void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
0.02% 1.9363ms 391 4.9520us 4.8320us 10.625us volta_fp16_sgemm_fp16_32x128_nn
0.02% 1.9315ms 3 643.84us 642.95us 644.58us volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
0.01% 1.6984ms 491 3.4580us 2.8160us 4.2240us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.01% 1.3932ms 491 2.8370us 2.7520us 3.5840us void splitKreduce_kernel<float, __half, float>(cublasSplitKParams<float>, float const *, __half const *, __half*, float const *, float const *)
0.01% 1.1027ms 391 2.8200us 2.2720us 3.5520us void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
0.01% 926.36us 491 1.8860us 1.6960us 3.4240us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
0.01% 777.16us 491 1.5820us 1.2800us 3.4890us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.01% 637.23us 391 1.6290us 1.3760us 3.1680us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
0.00% 496.38us 391 1.2690us 960ns 3.2960us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
0.00% 144.52us 62 2.3300us 1.1840us 19.521us [CUDA memcpy DtoD]
0.00% 43.234us 1 43.234us 43.234us 43.234us volta_fp16_scudnn_fp16_128x128_relu_interior_nn_v1
0.00% 26.881us 1 26.881us 26.881us 26.881us turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
O0 pure fp32 nvprof
Time(%) Time Calls Avg Min Max Name
10.12% 2.49372s 5865 425.19us 7.7450us 595.27us void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 const *)
9.75% 2.40237s 3128 768.02us 241.49us 962.20us volta_cgemm_32x32_tn
8.06% 1.98524s 391 5.0773ms 3.3001ms 6.7060ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
6.47% 1.59362s 7429 214.51us 123.85us 309.55us volta_sgemm_128x64_nt
6.24% 1.53810s 6874 223.76us 109.35us 307.67us volta_sgemm_128x64_nn
6.10% 1.50371s 3910 384.58us 24.898us 440.95us void fft2d_r2c_64x64<float>(float2*, float const *, int, int, int, int, int, int, int, int)
5.97% 1.47099s 11957 123.02us 7.2010us 239.73us void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
5.51% 1.35837s 11957 113.60us 32.353us 226.45us void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
4.76% 1.17378s 103900 11.297us 928ns 185.90us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
4.49% 1.10594s 14994 73.759us 3.2960us 186.00us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
4.28% 1.05338s 5865 179.60us 40.451us 310.19us void cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>(float, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnnTensorStruct, float const *, float, float const , float, cudnnTensorStruct*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>)
3.55% 873.68ms 3128 279.31us 41.602us 519.65us void flip_filter<float, float>(float*, float const *, int, int, int, int)
2.94% 724.88ms 2736 264.94us 162.83us 385.21us void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
2.63% 648.77ms 5860 110.71us 42.306us 196.40us void cudnn::detail::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=1, int=1>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=1, int=1>, cudnnTensorStruct*, float const *, float const , cudnnTensorStruct*, cudnnTensorStruct*, cudnnTensorStruct**, float const *, float const *, float const *, cudnnTensorStruct*, cudnnTensorStruct*)
2.15% 529.52ms 390 1.3577ms 1.3109ms 1.7981ms volta_gcgemm_32x32_nt
1.59% 392.37ms 11957 32.815us 2.7520us 124.01us void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
1.46% 359.73ms 391 920.03us 581.03us 1.2001ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
1.40% 344.74ms 782 440.85us 272.18us 633.86us volta_scudnn_128x128_stridedB_splitK_small_nn_v1
1.38% 340.18ms 982 346.42us 258.13us 463.96us volta_scudnn_128x128_relu_small_nn_v1
1.34% 329.08ms 1563 210.55us 192.17us 262.54us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
1.19% 294.09ms 782 376.08us 102.28us 798.29us void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int)
1.00% 245.49ms 1955 125.57us 13.665us 174.57us void fft2d_c2r_64x64<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*)
0.82% 201.46ms 2346 85.872us 40.515us 165.32us void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)
0.80% 196.69ms 491 400.59us 379.83us 528.35us volta_scudnn_128x64_relu_small_nn_v1
0.67% 165.03ms 782 211.04us 112.17us 349.04us volta_scudnn_128x128_stridedB_splitK_interior_nn_v1
0.66% 163.46ms 390 419.13us 409.72us 461.72us void fft2d_r2c_32x32<float, bool=1, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.56% 137.67ms 1110 124.03us 992ns 5.0508ms [CUDA memcpy HtoD]
0.48% 117.92ms 2346 50.265us 25.889us 67.044us void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
0.46% 112.27ms 2346 47.854us 19.105us 69.700us void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)
0.43% 105.99ms 2000 52.994us 24.802us 99.589us void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>(float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>, cudnnTensorStruct, float const *, float, cudnnTensorStruct*, float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const *, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>)
0.40% 97.487ms 24180 4.0310us 1.0560us 36.738us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.36% 89.534ms 783 114.35us 65.220us 395.77us void cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
0.31% 75.861ms 3520 21.551us 1.3440us 62.308us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.26% 63.671ms 24962 2.5500us 768ns 17.857us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
0.24% 57.943ms 1955 29.638us 17.633us 33.506us void cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(float, float, float, float, cudnnTensorStruct, float const *, cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, float const , cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)
0.23% 56.392ms 982 57.425us 44.866us 75.812us volta_scudnn_128x64_relu_interior_nn_v1
0.21% 50.851ms 391 130.05us 103.59us 153.00us void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
0.18% 43.979ms 1960 22.438us 14.017us 26.562us void cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)
0.14% 35.562ms 491 72.427us 55.299us 86.149us volta_scudnn_128x128_relu_interior_nn_v1
0.06% 14.862ms 7820 1.9000us 1.4720us 9.4400us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE4_clEvEUlllE_EEvS4_RKT_EUllE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.06% 13.896ms 391 35.540us 23.233us 45.667us void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<float, float>*, int, bool, bool)
0.04% 10.541ms 1104 9.5470us 1.2160us 2.0619ms [CUDA memcpy DtoH]
0.04% 9.0982ms 3128 2.9080us 1.6000us 7.8080us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.02% 6.1390ms 2745 2.2360us 960ns 7.6490us [CUDA memset]
0.02% 6.1029ms 491 12.429us 11.648us 16.577us void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<float, float>*, int, bool, bool)
0.02% 5.6197ms 2946 1.9070us 1.5040us 8.8970us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.02% 4.9938ms 1564 3.1920us 1.2800us 7.0080us cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.02% 3.8378ms 491 7.8160us 7.4240us 9.6320us volta_sgemm_32x32_sliced1x4_tn
0.01% 3.3765ms 391 8.6350us 7.0090us 11.232us _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
0.01% 3.0405ms 391 7.7760us 7.4560us 10.240us volta_sgemm_32x32_sliced1x4_nt
0.01% 2.8789ms 1564 1.8400us 1.3760us 5.5040us cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.01% 2.6288ms 491 5.3530us 4.5760us 6.7520us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::MaxOps<float>, unsigned int, float, int=4>>(float)
0.01% 2.1970ms 491 4.4740us 4.1920us 5.4410us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
0.01% 2.1036ms 491 4.2840us 4.0000us 6.6890us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.01% 2.0338ms 491 4.1420us 3.5520us 11.201us void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
0.01% 1.9302ms 391 4.9360us 4.7360us 6.1130us volta_sgemm_128x32_nn
0.01% 1.7138ms 491 3.4900us 2.9440us 4.2560us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.01% 1.4417ms 491 2.9360us 2.4960us 5.8880us void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
0.00% 1.1803ms 391 3.0180us 2.4000us 7.3930us void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
0.00% 1.0792ms 491 2.1980us 1.7280us 5.3440us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
0.00% 859.31us 491 1.7500us 1.2800us 3.2960us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.00% 846.99us 391 2.1660us 1.4400us 5.2480us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
0.00% 218.67us 62 3.5260us 1.1840us 36.354us [CUDA memcpy DtoD]
shape of 1 batch
inputs shape torch.Size([128, 3, 32, 32])
targets shape torch.Size([128])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
1 epoch used %.f sec: