Last active
October 4, 2020 15:23
-
-
Save dcslin/aa90e5258724cb9add9e9d67ae43e5fd to your computer and use it in GitHub Desktop.
conv+relu+fc fp32 fp16
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pass |
training used time 0.01820 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat16
Time(%) Time Calls Avg Min Max Name
57.18% 1.1399ms 1 1.1399ms 1.1399ms 1.1399ms generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
17.24% 343.60us 1 343.60us 343.60us 343.60us volta_hcudnn_128x128_stridedB_splitK_interior_nn_v1
5.10% 101.73us 1 101.73us 101.73us 101.73us volta_h884cudnn_256x128_ldg8_dgrad_exp_small_nhwc_tt_v1
4.06% 80.868us 1 80.868us 80.868us 80.868us void calc_bias_diff<int=2, __half, float, int=128, int=0>(cudnnTensorStruct, __half const *, cudnnTensorStruct, __half*, float, float, int)
3.53% 70.436us 1 70.436us 70.436us 70.436us turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_interior_nhwc_tn_v1
1.27% 25.410us 2 12.705us 8.9290us 16.481us volta_hgemm_128x64_nn
1.27% 25.314us 4 6.3280us 1.9200us 11.905us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.98% 19.585us 2 9.7920us 9.6010us 9.9840us void nhwcToNchwKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.85% 17.025us 12 1.4180us 1.2800us 1.7600us void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
0.84% 16.833us 1 16.833us 16.833us 16.833us volta_hgemm_128x128_nt
0.78% 15.456us 11 1.4050us 1.1200us 3.3600us [CUDA memcpy HtoD]
0.75% 14.880us 9 1.6530us 1.1520us 2.2400us [CUDA memset]
0.72% 14.370us 11 1.3060us 1.2160us 1.5680us [CUDA memcpy DtoD]
0.67% 13.441us 1 13.441us 13.441us 13.441us volta_hgemm_128x128_tn
0.62% 12.384us 7 1.7690us 1.2480us 4.2560us singa::cuda::KernelFloat2Half(unsigned long, float const *, __half*)
0.39% 7.8720us 4 1.9680us 1.5680us 2.2400us void axpy_kernel_ref<__half, float>(cublasAxpyParamsRef<__half, __half, float>)
0.29% 5.7280us 4 1.4320us 1.2480us 1.5360us singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
0.28% 5.6320us 3 1.8770us 1.2480us 3.1040us singa::cuda::KernelMult(unsigned long, float const *, float, float*)
0.26% 5.2160us 2 2.6080us 1.6960us 3.5200us void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
0.23% 4.5440us 1 4.5440us 4.5440us 4.5440us void cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>(cudnnTensorStruct, __half const *, cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>, cudnnTensorStruct*, int, float, cudnnTensorStruct*, int, int)
0.22% 4.4480us 2 2.2240us 1.9200us 2.5280us void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.22% 4.4160us 2 2.2080us 1.2800us 3.1360us singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
0.19% 3.8410us 1 3.8410us 3.8410us 3.8410us void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
0.19% 3.8400us 1 3.8400us 3.8400us 3.8400us void op_generic_tensor_kernel<int=4, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, reducedDivisorArray, int)
0.17% 3.4240us 2 1.7120us 1.5680us 1.8560us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.16% 3.1360us 1 3.1360us 3.1360us 3.1360us void splitKreduce_kernel<__half, __half, __half>(cublasSplitKParams<__half>, __half const *, __half const *, __half*, __half const *, __half const *)
0.15% 3.0090us 2 1.5040us 1.2800us 1.7290us singa::cuda::KernelMult(unsigned long, __half const *, __half, __half*)
0.15% 2.9120us 1 2.9120us 2.9120us 2.9120us singa::cuda::KernelReLUBackward(unsigned long, __half const *, __half const *, __half*)
0.13% 2.4960us 1 2.4960us 2.4960us 2.4960us singa::cuda::KernelRelu(unsigned long, __half const *, __half*)
0.12% 2.4320us 1 2.4320us 2.4320us 2.4320us void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<__half>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
0.11% 2.2720us 1 2.2720us 2.2720us 2.2720us singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
0.11% 2.2720us 1 2.2720us 2.2720us 2.2720us void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<__half const >, cublasGemvTensorStridedBatched<float>>>(__half const )
0.11% 2.1120us 1 2.1120us 2.1120us 2.1120us void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
0.09% 1.7600us 1 1.7600us 1.7600us 1.7600us singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
0.09% 1.7280us 1 1.7280us 1.7280us 1.7280us cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.08% 1.6650us 1 1.6650us 1.6650us 1.6650us [CUDA memcpy DtoH]
0.08% 1.6640us 1 1.6640us 1.6640us 1.6640us cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.08% 1.5680us 1 1.5680us 1.5680us 1.5680us singa::cuda::KernelHalf2Float(unsigned long, __half const *, float*)
0.08% 1.5040us 1 1.5040us 1.5040us 1.5040us singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
0.07% 1.4720us 1 1.4720us 1.4720us 1.4720us cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.07% 1.3440us 1 1.3440us 1.3440us 1.3440us void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)
training used time 0.01540 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat32
Time(%) Time Calls Avg Min Max Name
73.79% 1.1260ms 1 1.1260ms 1.1260ms 1.1260ms generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
5.29% 80.708us 1 80.708us 80.708us 80.708us void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
2.63% 40.099us 1 40.099us 40.099us 40.099us void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
2.28% 34.721us 3 11.573us 10.848us 12.800us void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1.65% 25.121us 1 25.121us 25.121us 25.121us volta_gcgemm_32x32_nt
1.32% 20.160us 14 1.4400us 1.2800us 1.9520us void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
1.29% 19.617us 9 2.1790us 1.1200us 10.017us [CUDA memcpy HtoD]
1.22% 18.594us 13 1.4300us 1.1840us 3.5210us [CUDA memcpy DtoD]
1.04% 15.905us 1 15.905us 15.905us 15.905us void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
0.78% 11.872us 1 11.872us 11.872us 11.872us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
0.68% 10.369us 1 10.369us 10.369us 10.369us void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.63% 9.6320us 1 9.6320us 9.6320us 9.6320us volta_sgemm_32x32_sliced1x4_nn
0.62% 9.4720us 1 9.4720us 9.4720us 9.4720us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
0.61% 9.2490us 1 9.2490us 9.2490us 9.2490us volta_sgemm_32x32_sliced1x4_nt
0.56% 8.4800us 5 1.6960us 1.2480us 3.2320us singa::cuda::KernelMult(unsigned long, float const *, float, float*)
0.51% 7.7760us 4 1.9440us 1.7280us 2.2720us [CUDA memset]
0.48% 7.3600us 4 1.8400us 1.4720us 2.0160us void axpy_kernel_ref<float, float>(cublasAxpyParamsRef<float, float, float>)
0.42% 6.4000us 1 6.4000us 6.4000us 6.4000us volta_sgemm_128x32_tn
0.38% 5.7280us 4 1.4320us 1.2800us 1.6640us singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
0.36% 5.4400us 2 2.7200us 1.7920us 3.6480us void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
0.29% 4.3850us 2 2.1920us 1.2480us 3.1370us singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
0.27% 4.1600us 1 4.1600us 4.1600us 4.1600us void flip_filter<float, float>(float*, float const *, int, int, int, int)
0.26% 4.0320us 2 2.0160us 2.0160us 2.0160us void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.25% 3.8730us 1 3.8730us 3.8730us 3.8730us void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
0.25% 3.8400us 1 3.8400us 3.8400us 3.8400us void op_generic_tensor_kernel<int=4, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.25% 3.7770us 1 3.7770us 3.7770us 3.7770us void cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>(cudnnTensorStruct, float const *, cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>, float*, int, float, float*, int, int)
0.20% 3.1040us 1 3.1040us 3.1040us 3.1040us singa::cuda::KernelReLUBackward(unsigned long, float const *, float const *, float*)
0.20% 3.0400us 1 3.0400us 3.0400us 3.0400us void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
0.17% 2.6240us 1 2.6240us 2.6240us 2.6240us void gemmk1_kernel<float, int=256, int=5, bool=0, bool=0, bool=0, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>, float>(cublasGemmk1Params<float, float const , cublasGemvTensorStridedBatched<float const >, float>)
0.17% 2.5600us 1 2.5600us 2.5600us 2.5600us singa::cuda::KernelRelu(unsigned long, float const *, float*)
0.16% 2.4960us 1 2.4960us 2.4960us 2.4960us void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<float>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
0.15% 2.2720us 1 2.2720us 2.2720us 2.2720us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.14% 2.1770us 1 2.1770us 2.1770us 2.1770us void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
0.14% 2.1760us 1 2.1760us 2.1760us 2.1760us void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<float const >, cublasGemvTensorStridedBatched<float>>>(float const )
0.12% 1.8880us 1 1.8880us 1.8880us 1.8880us singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, float const *, int const *, float*)
0.12% 1.8240us 1 1.8240us 1.8240us 1.8240us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.12% 1.8240us 1 1.8240us 1.8240us 1.8240us singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, float const *, int const *, float*)
0.11% 1.6320us 1 1.6320us 1.6320us 1.6320us [CUDA memcpy DtoH]
0.10% 1.5040us 1 1.5040us 1.5040us 1.5040us singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.
nvprof python torch-examples/one_layer.py --opt-level O3
training used time 0.02622 sec