Skip to content

Instantly share code, notes, and snippets.

@dcslin
Last active September 30, 2020 12:30
Show Gist options
  • Save dcslin/1ffeed1319c60381673e904848ce1e47 to your computer and use it in GitHub Desktop.
Save dcslin/1ffeed1319c60381673e904848ce1e47 to your computer and use it in GitHub Desktop.
benchmark pytorch resnet18 cifar10 apex amp
'''
Diff https://github.com/kuangliu/pytorch-cifar/blob/master/main.py
'''
from apex import amp

# Wrap the model and optimizer for mixed-precision training; the opt level
# (O0/O1/O2/O3) comes from the command line.
net, optimizer = amp.initialize(net, optimizer, opt_level=args.opt_level)

# if device == 'cuda':
#     net = torch.nn.DataParallel(net)
#     cudnn.benchmark = True

# Replaces the plain `loss.backward()`: backprop through the AMP-scaled loss
# so fp16 gradients do not underflow. NOTE: the original gist had
# `scaled_loss.backward()` unindented, which is a Python IndentationError —
# it must live inside the `with` block.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
@dcslin
Copy link
Author

dcslin commented Sep 29, 2020

O0 pure fp32 nvprof

Time(%)      Time     Calls       Avg       Min       Max  Name
 10.12%  2.49372s      5865  425.19us  7.7450us  595.27us  void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 const *)
  9.75%  2.40237s      3128  768.02us  241.49us  962.20us  volta_cgemm_32x32_tn
  8.06%  1.98524s       391  5.0773ms  3.3001ms  6.7060ms  void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
  6.47%  1.59362s      7429  214.51us  123.85us  309.55us  volta_sgemm_128x64_nt
  6.24%  1.53810s      6874  223.76us  109.35us  307.67us  volta_sgemm_128x64_nn
  6.10%  1.50371s      3910  384.58us  24.898us  440.95us  void fft2d_r2c_64x64<float>(float2*, float const *, int, int, int, int, int, int, int, int)
  5.97%  1.47099s     11957  123.02us  7.2010us  239.73us  void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
  5.51%  1.35837s     11957  113.60us  32.353us  226.45us  void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
  4.76%  1.17378s    103900  11.297us     928ns  185.90us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  4.49%  1.10594s     14994  73.759us  3.2960us  186.00us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  4.28%  1.05338s      5865  179.60us  40.451us  310.19us  void cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>(float, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnnTensorStruct, float const *, float, float const , float, cudnnTensorStruct*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>)
  3.55%  873.68ms      3128  279.31us  41.602us  519.65us  void flip_filter<float, float>(float*, float const *, int, int, int, int)
  2.94%  724.88ms      2736  264.94us  162.83us  385.21us  void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  2.63%  648.77ms      5860  110.71us  42.306us  196.40us  void cudnn::detail::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=1, int=1>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=1, int=1>, cudnnTensorStruct*, float const *, float const , cudnnTensorStruct*, cudnnTensorStruct*, cudnnTensorStruct**, float const *, float const *, float const *, cudnnTensorStruct*, cudnnTensorStruct*)
  2.15%  529.52ms       390  1.3577ms  1.3109ms  1.7981ms  volta_gcgemm_32x32_nt
  1.59%  392.37ms     11957  32.815us  2.7520us  124.01us  void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
  1.46%  359.73ms       391  920.03us  581.03us  1.2001ms  void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
  1.40%  344.74ms       782  440.85us  272.18us  633.86us  volta_scudnn_128x128_stridedB_splitK_small_nn_v1
  1.38%  340.18ms       982  346.42us  258.13us  463.96us  volta_scudnn_128x128_relu_small_nn_v1
  1.34%  329.08ms      1563  210.55us  192.17us  262.54us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  1.19%  294.09ms       782  376.08us  102.28us  798.29us  void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int)
  1.00%  245.49ms      1955  125.57us  13.665us  174.57us  void fft2d_c2r_64x64<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*)
  0.82%  201.46ms      2346  85.872us  40.515us  165.32us  void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)
  0.80%  196.69ms       491  400.59us  379.83us  528.35us  volta_scudnn_128x64_relu_small_nn_v1
  0.67%  165.03ms       782  211.04us  112.17us  349.04us  volta_scudnn_128x128_stridedB_splitK_interior_nn_v1
  0.66%  163.46ms       390  419.13us  409.72us  461.72us  void fft2d_r2c_32x32<float, bool=1, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.56%  137.67ms      1110  124.03us     992ns  5.0508ms  [CUDA memcpy HtoD]
  0.48%  117.92ms      2346  50.265us  25.889us  67.044us  void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
  0.46%  112.27ms      2346  47.854us  19.105us  69.700us  void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)
  0.43%  105.99ms      2000  52.994us  24.802us  99.589us  void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>(float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>, cudnnTensorStruct, float const *, float, cudnnTensorStruct*, float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const *, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>)
  0.40%  97.487ms     24180  4.0310us  1.0560us  36.738us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.36%  89.534ms       783  114.35us  65.220us  395.77us  void cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
  0.31%  75.861ms      3520  21.551us  1.3440us  62.308us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
  0.26%  63.671ms     24962  2.5500us     768ns  17.857us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
  0.24%  57.943ms      1955  29.638us  17.633us  33.506us  void cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(float, float, float, float, cudnnTensorStruct, float const *, cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, float const , cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)
  0.23%  56.392ms       982  57.425us  44.866us  75.812us  volta_scudnn_128x64_relu_interior_nn_v1
  0.21%  50.851ms       391  130.05us  103.59us  153.00us  void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
  0.18%  43.979ms      1960  22.438us  14.017us  26.562us  void cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)
  0.14%  35.562ms       491  72.427us  55.299us  86.149us  volta_scudnn_128x128_relu_interior_nn_v1
  0.06%  14.862ms      7820  1.9000us  1.4720us  9.4400us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE4_clEvEUlllE_EEvS4_RKT_EUllE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.06%  13.896ms       391  35.540us  23.233us  45.667us  void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_backward_out_cuda_frame<float, float>*, int, bool, bool)
  0.04%  10.541ms      1104  9.5470us  1.2160us  2.0619ms  [CUDA memcpy DtoH]
  0.04%  9.0982ms      3128  2.9080us  1.6000us  7.8080us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.02%  6.1390ms      2745  2.2360us     960ns  7.6490us  [CUDA memset]
  0.02%  6.1029ms       491  12.429us  11.648us  16.577us  void at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__60_tmpxft_00000731_00000000_11_AveragePool2d_compute_75_cpp1_ii_3bc0c910::avg_pool2d_out_cuda_frame<float, float>*, int, bool, bool)
  0.02%  5.6197ms      2946  1.9070us  1.5040us  8.8970us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.02%  4.9938ms      1564  3.1920us  1.2800us  7.0080us  cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.02%  3.8378ms       491  7.8160us  7.4240us  9.6320us  volta_sgemm_32x32_sliced1x4_tn
  0.01%  3.3765ms       391  8.6350us  7.0090us  11.232us  _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  0.01%  3.0405ms       391  7.7760us  7.4560us  10.240us  volta_sgemm_32x32_sliced1x4_nt
  0.01%  2.8789ms      1564  1.8400us  1.3760us  5.5040us  cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.01%  2.6288ms       491  5.3530us  4.5760us  6.7520us  void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::MaxOps<float>, unsigned int, float, int=4>>(float)
  0.01%  2.1970ms       491  4.4740us  4.1920us  5.4410us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
  0.01%  2.1036ms       491  4.2840us  4.0000us  6.6890us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.01%  2.0338ms       491  4.1420us  3.5520us  11.201us  void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
  0.01%  1.9302ms       391  4.9360us  4.7360us  6.1130us  volta_sgemm_128x32_nn
  0.01%  1.7138ms       491  3.4900us  2.9440us  4.2560us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.01%  1.4417ms       491  2.9360us  2.4960us  5.8880us  void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
  0.00%  1.1803ms       391  3.0180us  2.4000us  7.3930us  void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
  0.00%  1.0792ms       491  2.1980us  1.7280us  5.3440us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
  0.00%  859.31us       491  1.7500us  1.2800us  3.2960us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.00%  846.99us       391  2.1660us  1.4400us  5.2480us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
  0.00%  218.67us        62  3.5260us  1.1840us  36.354us  [CUDA memcpy DtoD]

@dcslin
Copy link
Author

dcslin commented Sep 30, 2020

shape of 1 batch
inputs shape torch.Size([128, 3, 32, 32])
targets shape torch.Size([128])

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment