@dcslin
Last active October 4, 2020 15:23
conv+relu+fc fp32 fp16
dcslin commented Oct 4, 2020

training used time 0.02451 sec
nvprof python torch-examples/one_layer.py --opt-level O0
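
The profiled one_layer.py is not part of this gist, so here is a minimal sketch, assuming a single conv+relu+fc model trained for one step with Apex AMP and the opt level taken from the command line; layer sizes, batch size, and learning rate are illustrative assumptions only:

import argparse
import torch
import torch.nn as nn
from apex import amp  # NVIDIA Apex mixed-precision training

parser = argparse.ArgumentParser()
parser.add_argument("--opt-level", default="O0")  # O0 = FP32, O3 = FP16
args = parser.parse_args()

# conv + relu + fc, matching the gist title (channel/feature sizes are assumptions)
model = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(32 * 32 * 32, 10),
).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

criterion = nn.CrossEntropyLoss()
x = torch.randn(64, 3, 32, 32, device="cuda")   # dummy input batch
y = torch.randint(0, 10, (64,), device="cuda")  # dummy labels

optimizer.zero_grad()
loss = criterion(model(x), y)
with amp.scale_loss(loss, optimizer) as scaled_loss:  # loss scaling handled by Apex (identity under O0)
    scaled_loss.backward()
optimizer.step()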

Time(%)      Time     Calls       Avg       Min       Max  Name
 36.60%  712.17us         2  356.08us  355.06us  357.11us  volta_scudnn_128x64_stridedB_splitK_interior_nn_v1
 16.38%  318.67us         1  318.67us  318.67us  318.67us  volta_sgemm_128x64_nt
  9.06%  176.33us         2  88.165us  87.205us  89.125us  void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int)
  3.86%  75.172us        12  6.2640us  1.1200us  53.635us  [CUDA memcpy HtoD]
  3.70%  72.067us         6  12.011us  10.881us  13.472us  void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  3.60%  70.147us         3  23.382us  22.369us  24.033us  void cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, float, float, int, int)
  3.17%  61.763us         1  61.763us  61.763us  61.763us  volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
  1.64%  31.938us         2  15.969us  15.969us  15.969us  void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
  1.31%  25.570us         1  25.570us  25.570us  25.570us  volta_gcgemm_32x32_nt
  1.21%  23.456us         2  11.728us  11.328us  12.128us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  1.18%  23.010us         1  23.010us  23.010us  23.010us  volta_cgemm_32x32_tn
  1.06%  20.577us         1  20.577us  20.577us  20.577us  void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, float const *, float const *)
  1.06%  20.544us        13  1.5800us  1.1200us  1.8880us  [CUDA memset]
  0.99%  19.361us         1  19.361us  19.361us  19.361us  volta_scudnn_128x32_relu_interior_nn_v1
  0.98%  19.138us         2  9.5690us  9.5690us  9.5690us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  0.98%  18.977us         1  18.977us  18.977us  18.977us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  0.92%  17.921us         1  17.921us  17.921us  17.921us  void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
  0.63%  12.321us         1  12.321us  12.321us  12.321us  void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const *, float*, int)
  0.61%  11.937us         1  11.937us  11.937us  11.937us  void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)
  0.54%  10.528us         3  3.5090us  1.6960us  4.7040us  void flip_filter<float, float>(float*, float const *, int, int, int, int)
  0.53%  10.401us         1  10.401us  10.401us  10.401us  void gemmSN_NN_kernel<float, int=128, int=2, int=4, int=8, int=3, int=4, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>)
  0.53%  10.273us         7  1.4670us  1.3120us  1.9200us  [CUDA memcpy DtoH]
  0.53%  10.272us         1  10.272us  10.272us  10.272us  void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.53%  10.241us         1  10.241us  10.241us  10.241us  void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)
  0.52%  10.176us         1  10.176us  10.176us  10.176us  volta_sgemm_32x32_sliced1x4_tn
  0.52%  10.176us         1  10.176us  10.176us  10.176us  void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.47%  9.2170us         1  9.2170us  9.2170us  9.2170us  volta_sgemm_32x32_sliced1x4_nt
  0.39%  7.6490us         1  7.6490us  7.6490us  7.6490us  _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  0.37%  7.1370us         1  7.1370us  7.1370us  7.1370us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESD_ILi1EjENS0_6memory15LoadWithoutCastENSG_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.34%  6.6560us         1  6.6560us  6.6560us  6.6560us  void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
  0.34%  6.6240us         4  1.6560us  1.4400us  1.9840us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.33%  6.3360us         1  6.3360us  6.3360us  6.3360us  volta_sgemm_128x32_nn
  0.29%  5.7280us         1  5.7280us  5.7280us  5.7280us  void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
  0.29%  5.6320us         1  5.6320us  5.6320us  5.6320us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIdNS0_14func_wrapper_tIdZNS0_27min_values_kernel_cuda_implIddEEvRNS_14TensorIteratorEEUlddE_EEjdLi4EEEEEvT1_
  0.29%  5.6000us         1  5.6000us  5.6000us  5.6000us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIdNS0_14func_wrapper_tIdZNS0_27max_values_kernel_cuda_implIddEEvRNS_14TensorIteratorEEUlddE_EEjdLi4EEEEEvT1_
  0.28%  5.5040us         3  1.8340us  1.8240us  1.8560us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.26%  5.0560us         1  5.0560us  5.0560us  5.0560us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.26%  5.0250us         2  2.5120us  2.3360us  2.6890us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.21%  4.0000us         1  4.0000us  4.0000us  4.0000us  void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
  0.18%  3.5210us         1  3.5210us  3.5210us  3.5210us  void cudnn::winograd::generateWinogradTilesKernel<int=0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)
  0.17%  3.2320us         2  1.6160us  1.5040us  1.7280us  cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.16%  3.1680us         1  3.1680us  3.1680us  3.1680us  void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
  0.16%  3.1040us         2  1.5520us  1.4400us  1.6640us  cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.16%  3.0720us         2  1.5360us  1.3760us  1.6960us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_15abs_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.16%  3.0400us         2  1.5200us  1.3440us  1.6960us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14ne_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.15%  3.0080us         1  3.0080us  3.0080us  3.0080us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::CopyIfAgent<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<bool>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, NonZeroOp<bool>, int, int*>, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<bool>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, NonZeroOp<bool>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long>(thrust::use_default, thrust::use_default, thrust::use_default, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, bool, thrust::device_ptr<bool>, long, thrust::device_ptr<long>)
  0.15%  2.9760us         2  1.4880us  1.3440us  1.6320us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14gt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE1_clEvENKUlvE_clEvEUlddE_EEvS4_RKT_EUldE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.15%  2.9120us         1  2.9120us  2.9120us  2.9120us  void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
  0.15%  2.8800us         1  2.8800us  2.8800us  2.8800us  _ZN2at6native24index_elementwise_kernelILi128ELi4EZNS0_16gpu_index_kernelIZNS0_17index_kernel_implINS0_10OpaqueTypeILi4EEEEEvRNS_14TensorIteratorEN3c108ArrayRefIlEESA_EUlPcSB_lE_EEvS7_SA_SA_RKT_EUliE_EEviT1_
  0.15%  2.8490us         1  2.8490us  2.8490us  2.8490us  void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
  0.15%  2.8480us         1  2.8480us  2.8480us  2.8480us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE1_clEvEUlddE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.15%  2.8480us         1  2.8480us  2.8480us  2.8480us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE1_clEvEUldE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.13%  2.5280us         2  1.2640us  1.1840us  1.3440us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
  0.12%  2.3040us         1  2.3040us  2.3040us  2.3040us  void _GLOBAL__N__54_tmpxft_00002350_00000000_10_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
  0.10%  1.9520us         1  1.9520us  1.9520us  1.9520us  void _GLOBAL__N__54_tmpxft_00002350_00000000_10_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
  0.10%  1.8880us         1  1.8880us  1.8880us  1.8880us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::cuda_cub::__transform::no_stencil_tag, idx_functor, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::cuda_cub::__transform::no_stencil_tag, idx_functor, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::device_ptr<long>, thrust::device_ptr<long>)
  0.10%  1.8560us         1  1.8560us  1.8560us  1.8560us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_15mul_kernel_cudaERNS_14TensorIteratorEEUlbbE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.09%  1.7920us         1  1.7920us  1.7920us  1.7920us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.09%  1.7600us         1  1.7600us  1.7600us  1.7600us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23bitwise_and_kernel_cudaERNS_14TensorIteratorEEUlbbE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.09%  1.6640us         1  1.6640us  1.6640us  1.6640us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14lt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE1_clEvENKUlvE_clEvEUlddE_EEvS4_RKT_EUldE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.08%  1.6320us         1  1.6320us  1.6320us  1.6320us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14ne_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.08%  1.6320us         1  1.6320us  1.6320us  1.6320us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.08%  1.5360us         1  1.5360us  1.5360us  1.5360us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16ceil_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.07%  1.3770us         1  1.3770us  1.3770us  1.3770us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
  0.07%  1.2800us         1  1.2800us  1.2800us  1.2800us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)

dcslin commented Oct 4, 2020

nvprof python torch-examples/one_layer.py --opt-level O3
training used time 0.02622 sec
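
The only change from the previous run is the Apex opt level; roughly, assuming the script simply forwards the flag to amp.initialize (per the Apex documentation, O0 keeps everything in FP32 while O3 casts the model and inputs to pure FP16):

# O0: FP32 baseline; O3: "pure" FP16 training
model, optimizer = amp.initialize(model, optimizer, opt_level="O3")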

Time(%)      Time     Calls       Avg       Min       Max  Name
 29.57%  1.0583ms         2  529.15us  527.90us  530.40us  volta_fp16_scudnn_fp16_128x128_stridedB_splitK_interior_nn_v1
 17.72%  634.18us         2  317.09us  315.86us  318.32us  volta_sgemm_128x64_nt
 14.96%  535.39us         1  535.39us  535.39us  535.39us  turing_s1689cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
 13.24%  473.75us         3  157.92us  156.55us  159.72us  void wgrad2d_grouped_direct_kernel<__half, float, float, cudnnTensorFormat_t=0>(cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, cudnnConvolutionStruct, cudnnFilterStruct, __half*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, int)
  2.07%  74.116us        12  6.1760us  1.1200us  52.835us  [CUDA memcpy HtoD]
  1.80%  64.547us         1  64.547us  64.547us  64.547us  turing_fp16_s1688cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
  1.62%  57.955us         1  57.955us  57.955us  57.955us  void cudnn::winograd::winograd3x3Kernel<__half, float, int=1, int=4, int=8, bool=1>(cudnn::maxwell::winograd::KernelParams)
  1.33%  47.650us         4  11.912us  11.009us  12.992us  void fft2d_r2c_32x32<__half, bool=0, unsigned int=0, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  1.30%  46.531us         2  23.265us  23.138us  23.393us  void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
  0.95%  34.018us         2  17.009us  16.417us  17.601us  void cudnn::winograd_nonfused::winogradWgradData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
  0.91%  32.579us         4  8.1440us  2.0160us  11.297us  void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
  0.82%  29.409us         3  9.8030us  9.6000us  10.176us  void gemmSN_NN_kernel<float, int=128, int=2, int=4, int=8, int=3, int=4, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>)
  0.73%  25.953us        16  1.6220us  1.1520us  1.9840us  [CUDA memset]
  0.69%  24.737us         1  24.737us  24.737us  24.737us  volta_gcgemm_32x32_nt
  0.67%  23.937us         2  11.968us  11.585us  12.352us  void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, __half>)
  0.65%  23.362us         1  23.362us  23.362us  23.362us  volta_cgemm_32x32_tn
  0.65%  23.137us         2  11.568us  10.849us  12.288us  void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
  0.62%  22.081us         1  22.081us  22.081us  22.081us  void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
  0.57%  20.353us         2  10.176us  9.9200us  10.433us  void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, __half>(cudnn::winograd_nonfused::WinogradDeltaParams<float, __half>)
  0.55%  19.809us         1  19.809us  19.809us  19.809us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
  0.55%  19.650us         1  19.650us  19.650us  19.650us  volta_fp16_scudnn_fp16_128x32_relu_interior_nn_v1
  0.54%  19.458us         6  3.2430us  1.6960us  4.5770us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESE_NS0_6memory12LoadWithCastILi1EEENSF_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.48%  17.312us         3  5.7700us  5.2160us  6.8160us  void cudnn::winograd_nonfused::winogradForwardData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
  0.45%  16.161us         1  16.161us  16.161us  16.161us  void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
  0.43%  15.361us         3  5.1200us  4.8320us  5.6640us  void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradOutputParams<float, __half>)
  0.29%  10.273us         1  10.273us  10.273us  10.273us  void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.28%  10.112us         1  10.112us  10.112us  10.112us  void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=1>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.28%  9.8890us         7  1.4120us  1.2800us  1.7920us  [CUDA memcpy DtoH]
  0.27%  9.6010us         1  9.6010us  9.6010us  9.6010us  void nhwcToNchwKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
  0.27%  9.5360us         1  9.5360us  9.5360us  9.5360us  void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
  0.26%  9.1840us         1  9.1840us  9.1840us  9.1840us  volta_fp16_sgemm_fp16_64x32_sliced1x4_nt
  0.25%  9.1210us         1  9.1210us  9.1210us  9.1210us  void im2col4d_kernel<__half, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, __half const *, __half*, int)
  0.23%  8.2570us         1  8.2570us  8.2570us  8.2570us  volta_sgemm_fp16_32x128_tn
  0.23%  8.1930us         1  8.1930us  8.1930us  8.1930us  _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
  0.21%  7.5200us         3  2.5060us  2.0800us  3.2320us  void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, __half>(cudnn::winograd_nonfused::WinogradFilterParams<float, __half>)
  0.20%  7.0730us         1  7.0730us  7.0730us  7.0730us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESE_ILi1EjENS0_6memory15LoadWithoutCastENSH_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.19%  6.9120us         1  6.9120us  6.9120us  6.9120us  volta_fp16_sgemm_fp16_128x32_nn
  0.19%  6.6250us         4  1.6560us  1.3760us  2.0160us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.15%  5.5360us         1  5.5360us  5.5360us  5.5360us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIdNS0_14func_wrapper_tIdZNS0_27min_values_kernel_cuda_implIddEEvRNS_14TensorIteratorEEUlddE_EEjdLi4EEEEEvT1_
  0.15%  5.5040us         2  2.7520us  1.5680us  3.9360us  void flip_filter<__half, __half>(__half*, __half const *, int, int, int, int)
  0.15%  5.5040us         1  5.5040us  5.5040us  5.5040us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.15%  5.3760us         1  5.3760us  5.3760us  5.3760us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIdNS0_14func_wrapper_tIdZNS0_27max_values_kernel_cuda_implIddEEvRNS_14TensorIteratorEEUlddE_EEjdLi4EEEEEvT1_
  0.15%  5.2800us         1  5.2800us  5.2800us  5.2800us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESE_NS0_6memory15LoadWithoutCastENSF_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.12%  4.3520us         1  4.3520us  4.3520us  4.3520us  void nhwcToNchwKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, float const *, __half*, float, float)
  0.12%  4.2250us         2  2.1120us  2.1120us  2.1130us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIN3c104HalfEEEvRNS_14TensorIteratorET_S7_EUlS4_S4_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.10%  3.7120us         1  3.7120us  3.7120us  3.7120us  void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
  0.10%  3.5520us         2  1.7760us  1.6640us  1.8880us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.10%  3.4560us         1  3.4560us  3.4560us  3.4560us  void splitKreduce_kernel<float, __half, float>(cublasSplitKParams<float>, float const *, __half const *, __half*, float const *, float const *)
  0.10%  3.4250us         1  3.4250us  3.4250us  3.4250us  void cudnn::winograd::generateWinogradTilesKernel<int=0, __half, float>(cudnn::winograd::GenerateWinogradTilesParams<__half, float>)
  0.09%  3.2330us         2  1.6160us  1.4080us  1.8250us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.09%  3.2320us         2  1.6160us  1.4720us  1.7600us  cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.08%  3.0400us         2  1.5200us  1.3760us  1.6640us  cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.08%  2.9760us         2  1.4880us  1.3440us  1.6320us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14gt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE1_clEvENKUlvE_clEvEUlddE_EEvS4_RKT_EUldE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.08%  2.9120us         1  2.9120us  2.9120us  2.9120us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE1_clEvEUlddE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.08%  2.8800us         2  1.4400us  1.3120us  1.5680us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14ne_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.08%  2.8800us         2  1.4400us  1.3760us  1.5040us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_15abs_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.08%  2.8490us         1  2.8490us  2.8490us  2.8490us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE1_clEvEUldE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.08%  2.8490us         1  2.8490us  2.8490us  2.8490us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::CopyIfAgent<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<bool>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, NonZeroOp<bool>, int, int*>, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<bool>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, NonZeroOp<bool>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long>(thrust::use_default, thrust::use_default, thrust::use_default, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, bool, thrust::device_ptr<bool>, long, thrust::device_ptr<long>)
  0.08%  2.7840us         1  2.7840us  2.7840us  2.7840us  _ZN2at6native24index_elementwise_kernelILi128ELi4EZNS0_16gpu_index_kernelIZNS0_17index_kernel_implINS0_10OpaqueTypeILi4EEEEEvRNS_14TensorIteratorEN3c108ArrayRefIlEESA_EUlPcSB_lE_EEvS7_SA_SA_RKT_EUliE_EEviT1_
  0.08%  2.6880us         1  2.6880us  2.6880us  2.6880us  void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
  0.07%  2.5600us         2  1.2800us  1.2160us  1.3440us  void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)
  0.06%  2.2720us         1  2.2720us  2.2720us  2.2720us  void _GLOBAL__N__54_tmpxft_00002350_00000000_10_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, int, int, int)
  0.05%  1.8880us         1  1.8880us  1.8880us  1.8880us  void _GLOBAL__N__54_tmpxft_00002350_00000000_10_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, c10::Half const , int, int, int)
  0.05%  1.8560us         1  1.8560us  1.8560us  1.8560us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::cuda_cub::__transform::no_stencil_tag, idx_functor, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::cuda_cub::__transform::no_stencil_tag, idx_functor, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::device_ptr<long>, thrust::device_ptr<long>)
  0.05%  1.7920us         1  1.7920us  1.7920us  1.7920us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_15mul_kernel_cudaERNS_14TensorIteratorEEUlbbE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.05%  1.7600us         1  1.7600us  1.7600us  1.7600us  cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.05%  1.7600us         1  1.7600us  1.7600us  1.7600us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23bitwise_and_kernel_cudaERNS_14TensorIteratorEEUlbbE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.05%  1.6960us         1  1.6960us  1.6960us  1.6960us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14lt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE1_clEvENKUlvE_clEvEUlddE_EEvS4_RKT_EUldE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.05%  1.6960us         1  1.6960us  1.6960us  1.6960us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.05%  1.6320us         1  1.6320us  1.6320us  1.6320us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16ceil_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.05%  1.6320us         1  1.6320us  1.6320us  1.6320us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14ne_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.04%  1.3760us         1  1.3760us  1.3760us  1.3760us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
  0.04%  1.3120us         1  1.3120us  1.3120us  1.3120us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
  0.04%  1.3120us         1  1.3120us  1.3120us  1.3120us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)

dcslin commented Oct 4, 2020

training used time 0.01820 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat16

Time(%)      Time     Calls       Avg       Min       Max  Name
 57.18%  1.1399ms         1  1.1399ms  1.1399ms  1.1399ms  generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
 17.24%  343.60us         1  343.60us  343.60us  343.60us  volta_hcudnn_128x128_stridedB_splitK_interior_nn_v1
  5.10%  101.73us         1  101.73us  101.73us  101.73us  volta_h884cudnn_256x128_ldg8_dgrad_exp_small_nhwc_tt_v1
  4.06%  80.868us         1  80.868us  80.868us  80.868us  void calc_bias_diff<int=2, __half, float, int=128, int=0>(cudnnTensorStruct, __half const *, cudnnTensorStruct, __half*, float, float, int)
  3.53%  70.436us         1  70.436us  70.436us  70.436us  turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_interior_nhwc_tn_v1
  1.27%  25.410us         2  12.705us  8.9290us  16.481us  volta_hgemm_128x64_nn
  1.27%  25.314us         4  6.3280us  1.9200us  11.905us  void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
  0.98%  19.585us         2  9.7920us  9.6010us  9.9840us  void nhwcToNchwKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
  0.85%  17.025us        12  1.4180us  1.2800us  1.7600us  void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
  0.84%  16.833us         1  16.833us  16.833us  16.833us  volta_hgemm_128x128_nt
  0.78%  15.456us        11  1.4050us  1.1200us  3.3600us  [CUDA memcpy HtoD]
  0.75%  14.880us         9  1.6530us  1.1520us  2.2400us  [CUDA memset]
  0.72%  14.370us        11  1.3060us  1.2160us  1.5680us  [CUDA memcpy DtoD]
  0.67%  13.441us         1  13.441us  13.441us  13.441us  volta_hgemm_128x128_tn
  0.62%  12.384us         7  1.7690us  1.2480us  4.2560us  singa::cuda::KernelFloat2Half(unsigned long, float const *, __half*)
  0.39%  7.8720us         4  1.9680us  1.5680us  2.2400us  void axpy_kernel_ref<__half, float>(cublasAxpyParamsRef<__half, __half, float>)
  0.29%  5.7280us         4  1.4320us  1.2480us  1.5360us  singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
  0.28%  5.6320us         3  1.8770us  1.2480us  3.1040us  singa::cuda::KernelMult(unsigned long, float const *, float, float*)
  0.26%  5.2160us         2  2.6080us  1.6960us  3.5200us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
  0.23%  4.5440us         1  4.5440us  4.5440us  4.5440us  void cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>(cudnnTensorStruct, __half const *, cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>, cudnnTensorStruct*, int, float, cudnnTensorStruct*, int, int)
  0.22%  4.4480us         2  2.2240us  1.9200us  2.5280us  void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.22%  4.4160us         2  2.2080us  1.2800us  3.1360us  singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
  0.19%  3.8410us         1  3.8410us  3.8410us  3.8410us  void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
  0.19%  3.8400us         1  3.8400us  3.8400us  3.8400us  void op_generic_tensor_kernel<int=4, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, reducedDivisorArray, int)
  0.17%  3.4240us         2  1.7120us  1.5680us  1.8560us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.16%  3.1360us         1  3.1360us  3.1360us  3.1360us  void splitKreduce_kernel<__half, __half, __half>(cublasSplitKParams<__half>, __half const *, __half const *, __half*, __half const *, __half const *)
  0.15%  3.0090us         2  1.5040us  1.2800us  1.7290us  singa::cuda::KernelMult(unsigned long, __half const *, __half, __half*)
  0.15%  2.9120us         1  2.9120us  2.9120us  2.9120us  singa::cuda::KernelReLUBackward(unsigned long, __half const *, __half const *, __half*)
  0.13%  2.4960us         1  2.4960us  2.4960us  2.4960us  singa::cuda::KernelRelu(unsigned long, __half const *, __half*)
  0.12%  2.4320us         1  2.4320us  2.4320us  2.4320us  void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<__half>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
  0.11%  2.2720us         1  2.2720us  2.2720us  2.2720us  singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
  0.11%  2.2720us         1  2.2720us  2.2720us  2.2720us  void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<__half const >, cublasGemvTensorStridedBatched<float>>>(__half const )
  0.11%  2.1120us         1  2.1120us  2.1120us  2.1120us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
  0.09%  1.7600us         1  1.7600us  1.7600us  1.7600us  singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
  0.09%  1.7280us         1  1.7280us  1.7280us  1.7280us  cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.08%  1.6650us         1  1.6650us  1.6650us  1.6650us  [CUDA memcpy DtoH]
  0.08%  1.6640us         1  1.6640us  1.6640us  1.6640us  cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.08%  1.5680us         1  1.5680us  1.5680us  1.5680us  singa::cuda::KernelHalf2Float(unsigned long, __half const *, float*)
  0.08%  1.5040us         1  1.5040us  1.5040us  1.5040us  singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
  0.07%  1.4720us         1  1.4720us  1.4720us  1.4720us  cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.07%  1.3440us         1  1.3440us  1.3440us  1.3440us  void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)

dcslin commented Oct 4, 2020

training used time 0.01540 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat32
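
One way, not part of the original gist, to keep the two SINGA summaries around for a side-by-side comparison is to redirect each nvprof report to its own log file:

nvprof --log-file prof_fp16.log python examples/cnn/train_cnn_half.py -pfloat16
nvprof --log-file prof_fp32.log python examples/cnn/train_cnn_half.py -pfloat32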

Time(%)      Time     Calls       Avg       Min       Max  Name
 73.79%  1.1260ms         1  1.1260ms  1.1260ms  1.1260ms  generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
  5.29%  80.708us         1  80.708us  80.708us  80.708us  void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
  2.63%  40.099us         1  40.099us  40.099us  40.099us  void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
  2.28%  34.721us         3  11.573us  10.848us  12.800us  void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  1.65%  25.121us         1  25.121us  25.121us  25.121us  volta_gcgemm_32x32_nt
  1.32%  20.160us        14  1.4400us  1.2800us  1.9520us  void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
  1.29%  19.617us         9  2.1790us  1.1200us  10.017us  [CUDA memcpy HtoD]
  1.22%  18.594us        13  1.4300us  1.1840us  3.5210us  [CUDA memcpy DtoD]
  1.04%  15.905us         1  15.905us  15.905us  15.905us  void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
  0.78%  11.872us         1  11.872us  11.872us  11.872us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  0.68%  10.369us         1  10.369us  10.369us  10.369us  void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.63%  9.6320us         1  9.6320us  9.6320us  9.6320us  volta_sgemm_32x32_sliced1x4_nn
  0.62%  9.4720us         1  9.4720us  9.4720us  9.4720us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  0.61%  9.2490us         1  9.2490us  9.2490us  9.2490us  volta_sgemm_32x32_sliced1x4_nt
  0.56%  8.4800us         5  1.6960us  1.2480us  3.2320us  singa::cuda::KernelMult(unsigned long, float const *, float, float*)
  0.51%  7.7760us         4  1.9440us  1.7280us  2.2720us  [CUDA memset]
  0.48%  7.3600us         4  1.8400us  1.4720us  2.0160us  void axpy_kernel_ref<float, float>(cublasAxpyParamsRef<float, float, float>)
  0.42%  6.4000us         1  6.4000us  6.4000us  6.4000us  volta_sgemm_128x32_tn
  0.38%  5.7280us         4  1.4320us  1.2800us  1.6640us  singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
  0.36%  5.4400us         2  2.7200us  1.7920us  3.6480us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
  0.29%  4.3850us         2  2.1920us  1.2480us  3.1370us  singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
  0.27%  4.1600us         1  4.1600us  4.1600us  4.1600us  void flip_filter<float, float>(float*, float const *, int, int, int, int)
  0.26%  4.0320us         2  2.0160us  2.0160us  2.0160us  void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.25%  3.8730us         1  3.8730us  3.8730us  3.8730us  void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
  0.25%  3.8400us         1  3.8400us  3.8400us  3.8400us  void op_generic_tensor_kernel<int=4, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.25%  3.7770us         1  3.7770us  3.7770us  3.7770us  void cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>(cudnnTensorStruct, float const *, cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>, float*, int, float, float*, int, int)
  0.20%  3.1040us         1  3.1040us  3.1040us  3.1040us  singa::cuda::KernelReLUBackward(unsigned long, float const *, float const *, float*)
  0.20%  3.0400us         1  3.0400us  3.0400us  3.0400us  void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
  0.17%  2.6240us         1  2.6240us  2.6240us  2.6240us  void gemmk1_kernel<float, int=256, int=5, bool=0, bool=0, bool=0, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>, float>(cublasGemmk1Params<float, float const , cublasGemvTensorStridedBatched<float const >, float>)
  0.17%  2.5600us         1  2.5600us  2.5600us  2.5600us  singa::cuda::KernelRelu(unsigned long, float const *, float*)
  0.16%  2.4960us         1  2.4960us  2.4960us  2.4960us  void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<float>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
  0.15%  2.2720us         1  2.2720us  2.2720us  2.2720us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
  0.14%  2.1770us         1  2.1770us  2.1770us  2.1770us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
  0.14%  2.1760us         1  2.1760us  2.1760us  2.1760us  void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<float const >, cublasGemvTensorStridedBatched<float>>>(float const )
  0.12%  1.8880us         1  1.8880us  1.8880us  1.8880us  singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, float const *, int const *, float*)
  0.12%  1.8240us         1  1.8240us  1.8240us  1.8240us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.12%  1.8240us         1  1.8240us  1.8240us  1.8240us  singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, float const *, int const *, float*)
  0.11%  1.6320us         1  1.6320us  1.6320us  1.6320us  [CUDA memcpy DtoH]
  0.10%  1.5040us         1  1.5040us  1.5040us  1.5040us  singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
