Last active
October 4, 2020 15:23
-
-
Save dcslin/aa90e5258724cb9add9e9d67ae43e5fd to your computer and use it in GitHub Desktop.
nvprof profiling results for a conv+relu+fc network, comparing FP32 and FP16 training
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pass |
nvprof python torch-examples/one_layer.py --opt-level O3
training used time 0.02622 sec
Time(%) Time Calls Avg Min Max Name
29.57% 1.0583ms 2 529.15us 527.90us 530.40us volta_fp16_scudnn_fp16_128x128_stridedB_splitK_interior_nn_v1
17.72% 634.18us 2 317.09us 315.86us 318.32us volta_sgemm_128x64_nt
14.96% 535.39us 1 535.39us 535.39us 535.39us turing_s1689cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
13.24% 473.75us 3 157.92us 156.55us 159.72us void wgrad2d_grouped_direct_kernel<__half, float, float, cudnnTensorFormat_t=0>(cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, cudnnConvolutionStruct, cudnnFilterStruct, __half*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, int)
2.07% 74.116us 12 6.1760us 1.1200us 52.835us [CUDA memcpy HtoD]
1.80% 64.547us 1 64.547us 64.547us 64.547us turing_fp16_s1688cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
1.62% 57.955us 1 57.955us 57.955us 57.955us void cudnn::winograd::winograd3x3Kernel<__half, float, int=1, int=4, int=8, bool=1>(cudnn::maxwell::winograd::KernelParams)
1.33% 47.650us 4 11.912us 11.009us 12.992us void fft2d_r2c_32x32<__half, bool=0, unsigned int=0, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1.30% 46.531us 2 23.265us 23.138us 23.393us void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.95% 34.018us 2 17.009us 16.417us 17.601us void cudnn::winograd_nonfused::winogradWgradData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
0.91% 32.579us 4 8.1440us 2.0160us 11.297us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.82% 29.409us 3 9.8030us 9.6000us 10.176us void gemmSN_NN_kernel<float, int=128, int=2, int=4, int=8, int=3, int=4, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>)
0.73% 25.953us 16 1.6220us 1.1520us 1.9840us [CUDA memset]
0.69% 24.737us 1 24.737us 24.737us 24.737us volta_gcgemm_32x32_nt
0.67% 23.937us 2 11.968us 11.585us 12.352us void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, __half>)
0.65% 23.362us 1 23.362us 23.362us 23.362us volta_cgemm_32x32_tn
0.65% 23.137us 2 11.568us 10.849us 12.288us void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
0.62% 22.081us 1 22.081us 22.081us 22.081us void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
0.57% 20.353us 2 10.176us 9.9200us 10.433us void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, __half>(cudnn::winograd_nonfused::WinogradDeltaParams<float, __half>)
0.55% 19.809us 1 19.809us 19.809us 19.809us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
0.55% 19.650us 1 19.650us 19.650us 19.650us volta_fp16_scudnn_fp16_128x32_relu_interior_nn_v1
0.54% 19.458us 6 3.2430us 1.6960us 4.5770us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESE_NS0_6memory12LoadWithCastILi1EEENSF_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.48% 17.312us 3 5.7700us 5.2160us 6.8160us void cudnn::winograd_nonfused::winogradForwardData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
0.45% 16.161us 1 16.161us 16.161us 16.161us void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
0.43% 15.361us 3 5.1200us 4.8320us 5.6640us void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradOutputParams<float, __half>)
0.29% 10.273us 1 10.273us 10.273us 10.273us void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.28% 10.112us 1 10.112us 10.112us 10.112us void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=1>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.28% 9.8890us 7 1.4120us 1.2800us 1.7920us [CUDA memcpy DtoH]
0.27% 9.6010us 1 9.6010us 9.6010us 9.6010us void nhwcToNchwKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.27% 9.5360us 1 9.5360us 9.5360us 9.5360us void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
0.26% 9.1840us 1 9.1840us 9.1840us 9.1840us volta_fp16_sgemm_fp16_64x32_sliced1x4_nt
0.25% 9.1210us 1 9.1210us 9.1210us 9.1210us void im2col4d_kernel<__half, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, __half const *, __half*, int)
0.23% 8.2570us 1 8.2570us 8.2570us 8.2570us volta_sgemm_fp16_32x128_tn
0.23% 8.1930us 1 8.1930us 8.1930us 8.1930us _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
0.21% 7.5200us 3 2.5060us 2.0800us 3.2320us void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, __half>(cudnn::winograd_nonfused::WinogradFilterParams<float, __half>)
0.20% 7.0730us 1 7.0730us 7.0730us 7.0730us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESE_ILi1EjENS0_6memory15LoadWithoutCastENSH_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.19% 6.9120us 1 6.9120us 6.9120us 6.9120us volta_fp16_sgemm_fp16_128x32_nn
0.19% 6.6250us 4 1.6560us 1.3760us 2.0160us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.15% 5.5360us 1 5.5360us 5.5360us 5.5360us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIdNS0_14func_wrapper_tIdZNS0_27min_values_kernel_cuda_implIddEEvRNS_14TensorIteratorEEUlddE_EEjdLi4EEEEEvT1_
0.15% 5.5040us 2 2.7520us 1.5680us 3.9360us void flip_filter<__half, __half>(__half*, __half const *, int, int, int, int)
0.15% 5.5040us 1 5.5040us 5.5040us 5.5040us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.15% 5.3760us 1 5.3760us 5.3760us 5.3760us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIdNS0_14func_wrapper_tIdZNS0_27max_values_kernel_cuda_implIddEEvRNS_14TensorIteratorEEUlddE_EEjdLi4EEEEEvT1_
0.15% 5.2800us 1 5.2800us 5.2800us 5.2800us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESE_NS0_6memory15LoadWithoutCastENSF_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.12% 4.3520us 1 4.3520us 4.3520us 4.3520us void nhwcToNchwKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, float const *, __half*, float, float)
0.12% 4.2250us 2 2.1120us 2.1120us 2.1130us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIN3c104HalfEEEvRNS_14TensorIteratorET_S7_EUlS4_S4_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.10% 3.7120us 1 3.7120us 3.7120us 3.7120us void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
0.10% 3.5520us 2 1.7760us 1.6640us 1.8880us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.10% 3.4560us 1 3.4560us 3.4560us 3.4560us void splitKreduce_kernel<float, __half, float>(cublasSplitKParams<float>, float const *, __half const *, __half*, float const *, float const *)
0.10% 3.4250us 1 3.4250us 3.4250us 3.4250us void cudnn::winograd::generateWinogradTilesKernel<int=0, __half, float>(cudnn::winograd::GenerateWinogradTilesParams<__half, float>)
0.09% 3.2330us 2 1.6160us 1.4080us 1.8250us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.09% 3.2320us 2 1.6160us 1.4720us 1.7600us cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.08% 3.0400us 2 1.5200us 1.3760us 1.6640us cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.08% 2.9760us 2 1.4880us 1.3440us 1.6320us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14gt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE1_clEvENKUlvE_clEvEUlddE_EEvS4_RKT_EUldE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.08% 2.9120us 1 2.9120us 2.9120us 2.9120us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE1_clEvEUlddE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.08% 2.8800us 2 1.4400us 1.3120us 1.5680us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14ne_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.08% 2.8800us 2 1.4400us 1.3760us 1.5040us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_15abs_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.08% 2.8490us 1 2.8490us 2.8490us 2.8490us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE1_clEvEUldE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.08% 2.8490us 1 2.8490us 2.8490us 2.8490us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::CopyIfAgent<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<bool>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, NonZeroOp<bool>, int, int*>, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<bool>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, NonZeroOp<bool>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long>(thrust::use_default, thrust::use_default, thrust::use_default, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, bool, thrust::device_ptr<bool>, long, thrust::device_ptr<long>)
0.08% 2.7840us 1 2.7840us 2.7840us 2.7840us _ZN2at6native24index_elementwise_kernelILi128ELi4EZNS0_16gpu_index_kernelIZNS0_17index_kernel_implINS0_10OpaqueTypeILi4EEEEEvRNS_14TensorIteratorEN3c108ArrayRefIlEESA_EUlPcSB_lE_EEvS7_SA_SA_RKT_EUliE_EEviT1_
0.08% 2.6880us 1 2.6880us 2.6880us 2.6880us void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
0.07% 2.5600us 2 1.2800us 1.2160us 1.3440us void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)
0.06% 2.2720us 1 2.2720us 2.2720us 2.2720us void _GLOBAL__N__54_tmpxft_00002350_00000000_10_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, int, int, int)
0.05% 1.8880us 1 1.8880us 1.8880us 1.8880us void _GLOBAL__N__54_tmpxft_00002350_00000000_10_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, c10::Half const , int, int, int)
0.05% 1.8560us 1 1.8560us 1.8560us 1.8560us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::cuda_cub::__transform::no_stencil_tag, idx_functor, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::permutation_iterator<thrust::device_ptr<long>, thrust::transform_iterator<strided_range<thrust::device_ptr<long>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>, thrust::cuda_cub::__transform::no_stencil_tag, idx_functor, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::device_ptr<long>, thrust::device_ptr<long>)
0.05% 1.7920us 1 1.7920us 1.7920us 1.7920us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_15mul_kernel_cudaERNS_14TensorIteratorEEUlbbE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.05% 1.7600us 1 1.7600us 1.7600us 1.7600us cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.05% 1.7600us 1 1.7600us 1.7600us 1.7600us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23bitwise_and_kernel_cudaERNS_14TensorIteratorEEUlbbE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.05% 1.6960us 1 1.6960us 1.6960us 1.6960us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZZNS0_14lt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE1_clEvENKUlvE_clEvEUlddE_EEvS4_RKT_EUldE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.05% 1.6960us 1 1.6960us 1.6960us 1.6960us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.05% 1.6320us 1 1.6320us 1.6320us 1.6320us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16ceil_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.05% 1.6320us 1 1.6320us 1.6320us 1.6320us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14ne_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.04% 1.3760us 1 1.3760us 1.3760us 1.3760us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.04% 1.3120us 1 1.3120us 1.3120us 1.3120us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
0.04% 1.3120us 1 1.3120us 1.3120us 1.3120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)
training used time 0.01820 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat16
Time(%) Time Calls Avg Min Max Name
57.18% 1.1399ms 1 1.1399ms 1.1399ms 1.1399ms generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
17.24% 343.60us 1 343.60us 343.60us 343.60us volta_hcudnn_128x128_stridedB_splitK_interior_nn_v1
5.10% 101.73us 1 101.73us 101.73us 101.73us volta_h884cudnn_256x128_ldg8_dgrad_exp_small_nhwc_tt_v1
4.06% 80.868us 1 80.868us 80.868us 80.868us void calc_bias_diff<int=2, __half, float, int=128, int=0>(cudnnTensorStruct, __half const *, cudnnTensorStruct, __half*, float, float, int)
3.53% 70.436us 1 70.436us 70.436us 70.436us turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_interior_nhwc_tn_v1
1.27% 25.410us 2 12.705us 8.9290us 16.481us volta_hgemm_128x64_nn
1.27% 25.314us 4 6.3280us 1.9200us 11.905us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.98% 19.585us 2 9.7920us 9.6010us 9.9840us void nhwcToNchwKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.85% 17.025us 12 1.4180us 1.2800us 1.7600us void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
0.84% 16.833us 1 16.833us 16.833us 16.833us volta_hgemm_128x128_nt
0.78% 15.456us 11 1.4050us 1.1200us 3.3600us [CUDA memcpy HtoD]
0.75% 14.880us 9 1.6530us 1.1520us 2.2400us [CUDA memset]
0.72% 14.370us 11 1.3060us 1.2160us 1.5680us [CUDA memcpy DtoD]
0.67% 13.441us 1 13.441us 13.441us 13.441us volta_hgemm_128x128_tn
0.62% 12.384us 7 1.7690us 1.2480us 4.2560us singa::cuda::KernelFloat2Half(unsigned long, float const *, __half*)
0.39% 7.8720us 4 1.9680us 1.5680us 2.2400us void axpy_kernel_ref<__half, float>(cublasAxpyParamsRef<__half, __half, float>)
0.29% 5.7280us 4 1.4320us 1.2480us 1.5360us singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
0.28% 5.6320us 3 1.8770us 1.2480us 3.1040us singa::cuda::KernelMult(unsigned long, float const *, float, float*)
0.26% 5.2160us 2 2.6080us 1.6960us 3.5200us void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
0.23% 4.5440us 1 4.5440us 4.5440us 4.5440us void cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>(cudnnTensorStruct, __half const *, cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>, cudnnTensorStruct*, int, float, cudnnTensorStruct*, int, int)
0.22% 4.4480us 2 2.2240us 1.9200us 2.5280us void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.22% 4.4160us 2 2.2080us 1.2800us 3.1360us singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
0.19% 3.8410us 1 3.8410us 3.8410us 3.8410us void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
0.19% 3.8400us 1 3.8400us 3.8400us 3.8400us void op_generic_tensor_kernel<int=4, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, reducedDivisorArray, int)
0.17% 3.4240us 2 1.7120us 1.5680us 1.8560us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.16% 3.1360us 1 3.1360us 3.1360us 3.1360us void splitKreduce_kernel<__half, __half, __half>(cublasSplitKParams<__half>, __half const *, __half const *, __half*, __half const *, __half const *)
0.15% 3.0090us 2 1.5040us 1.2800us 1.7290us singa::cuda::KernelMult(unsigned long, __half const *, __half, __half*)
0.15% 2.9120us 1 2.9120us 2.9120us 2.9120us singa::cuda::KernelReLUBackward(unsigned long, __half const *, __half const *, __half*)
0.13% 2.4960us 1 2.4960us 2.4960us 2.4960us singa::cuda::KernelRelu(unsigned long, __half const *, __half*)
0.12% 2.4320us 1 2.4320us 2.4320us 2.4320us void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<__half>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
0.11% 2.2720us 1 2.2720us 2.2720us 2.2720us singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
0.11% 2.2720us 1 2.2720us 2.2720us 2.2720us void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<__half const >, cublasGemvTensorStridedBatched<float>>>(__half const )
0.11% 2.1120us 1 2.1120us 2.1120us 2.1120us void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
0.09% 1.7600us 1 1.7600us 1.7600us 1.7600us singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
0.09% 1.7280us 1 1.7280us 1.7280us 1.7280us cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.08% 1.6650us 1 1.6650us 1.6650us 1.6650us [CUDA memcpy DtoH]
0.08% 1.6640us 1 1.6640us 1.6640us 1.6640us cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.08% 1.5680us 1 1.5680us 1.5680us 1.5680us singa::cuda::KernelHalf2Float(unsigned long, __half const *, float*)
0.08% 1.5040us 1 1.5040us 1.5040us 1.5040us singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
0.07% 1.4720us 1 1.4720us 1.4720us 1.4720us cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.07% 1.3440us 1 1.3440us 1.3440us 1.3440us void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)
training used time 0.01540 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat32
Time(%) Time Calls Avg Min Max Name
73.79% 1.1260ms 1 1.1260ms 1.1260ms 1.1260ms generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
5.29% 80.708us 1 80.708us 80.708us 80.708us void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
2.63% 40.099us 1 40.099us 40.099us 40.099us void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
2.28% 34.721us 3 11.573us 10.848us 12.800us void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1.65% 25.121us 1 25.121us 25.121us 25.121us volta_gcgemm_32x32_nt
1.32% 20.160us 14 1.4400us 1.2800us 1.9520us void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
1.29% 19.617us 9 2.1790us 1.1200us 10.017us [CUDA memcpy HtoD]
1.22% 18.594us 13 1.4300us 1.1840us 3.5210us [CUDA memcpy DtoD]
1.04% 15.905us 1 15.905us 15.905us 15.905us void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
0.78% 11.872us 1 11.872us 11.872us 11.872us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
0.68% 10.369us 1 10.369us 10.369us 10.369us void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.63% 9.6320us 1 9.6320us 9.6320us 9.6320us volta_sgemm_32x32_sliced1x4_nn
0.62% 9.4720us 1 9.4720us 9.4720us 9.4720us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
0.61% 9.2490us 1 9.2490us 9.2490us 9.2490us volta_sgemm_32x32_sliced1x4_nt
0.56% 8.4800us 5 1.6960us 1.2480us 3.2320us singa::cuda::KernelMult(unsigned long, float const *, float, float*)
0.51% 7.7760us 4 1.9440us 1.7280us 2.2720us [CUDA memset]
0.48% 7.3600us 4 1.8400us 1.4720us 2.0160us void axpy_kernel_ref<float, float>(cublasAxpyParamsRef<float, float, float>)
0.42% 6.4000us 1 6.4000us 6.4000us 6.4000us volta_sgemm_128x32_tn
0.38% 5.7280us 4 1.4320us 1.2800us 1.6640us singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
0.36% 5.4400us 2 2.7200us 1.7920us 3.6480us void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
0.29% 4.3850us 2 2.1920us 1.2480us 3.1370us singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
0.27% 4.1600us 1 4.1600us 4.1600us 4.1600us void flip_filter<float, float>(float*, float const *, int, int, int, int)
0.26% 4.0320us 2 2.0160us 2.0160us 2.0160us void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.25% 3.8730us 1 3.8730us 3.8730us 3.8730us void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
0.25% 3.8400us 1 3.8400us 3.8400us 3.8400us void op_generic_tensor_kernel<int=4, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.25% 3.7770us 1 3.7770us 3.7770us 3.7770us void cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>(cudnnTensorStruct, float const *, cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>, float*, int, float, float*, int, int)
0.20% 3.1040us 1 3.1040us 3.1040us 3.1040us singa::cuda::KernelReLUBackward(unsigned long, float const *, float const *, float*)
0.20% 3.0400us 1 3.0400us 3.0400us 3.0400us void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
0.17% 2.6240us 1 2.6240us 2.6240us 2.6240us void gemmk1_kernel<float, int=256, int=5, bool=0, bool=0, bool=0, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>, float>(cublasGemmk1Params<float, float const , cublasGemvTensorStridedBatched<float const >, float>)
0.17% 2.5600us 1 2.5600us 2.5600us 2.5600us singa::cuda::KernelRelu(unsigned long, float const *, float*)
0.16% 2.4960us 1 2.4960us 2.4960us 2.4960us void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<float>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
0.15% 2.2720us 1 2.2720us 2.2720us 2.2720us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.14% 2.1770us 1 2.1770us 2.1770us 2.1770us void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
0.14% 2.1760us 1 2.1760us 2.1760us 2.1760us void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<float const >, cublasGemvTensorStridedBatched<float>>>(float const )
0.12% 1.8880us 1 1.8880us 1.8880us 1.8880us singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, float const *, int const *, float*)
0.12% 1.8240us 1 1.8240us 1.8240us 1.8240us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.12% 1.8240us 1 1.8240us 1.8240us 1.8240us singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, float const *, int const *, float*)
0.11% 1.6320us 1 1.6320us 1.6320us 1.6320us [CUDA memcpy DtoH]
0.10% 1.5040us 1 1.5040us 1.5040us 1.5040us singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
training used time 0.02451 sec
nvprof python torch-examples/one_layer.py --opt-level O0