Last active
October 4, 2020 15:23
-
-
Save dcslin/aa90e5258724cb9add9e9d67ae43e5fd to your computer and use it in GitHub Desktop.
nvprof profiling of a conv + relu + fc model, comparing fp32 and fp16 training runs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pass
training used time 0.01540 sec (fp32 run, from the command below)
nvprof python examples/cnn/train_cnn_half.py -pfloat32
Time(%) Time Calls Avg Min Max Name
73.79% 1.1260ms 1 1.1260ms 1.1260ms 1.1260ms generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
5.29% 80.708us 1 80.708us 80.708us 80.708us void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
2.63% 40.099us 1 40.099us 40.099us 40.099us void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
2.28% 34.721us 3 11.573us 10.848us 12.800us void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1.65% 25.121us 1 25.121us 25.121us 25.121us volta_gcgemm_32x32_nt
1.32% 20.160us 14 1.4400us 1.2800us 1.9520us void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
1.29% 19.617us 9 2.1790us 1.1200us 10.017us [CUDA memcpy HtoD]
1.22% 18.594us 13 1.4300us 1.1840us 3.5210us [CUDA memcpy DtoD]
1.04% 15.905us 1 15.905us 15.905us 15.905us void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
0.78% 11.872us 1 11.872us 11.872us 11.872us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
0.68% 10.369us 1 10.369us 10.369us 10.369us void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.63% 9.6320us 1 9.6320us 9.6320us 9.6320us volta_sgemm_32x32_sliced1x4_nn
0.62% 9.4720us 1 9.4720us 9.4720us 9.4720us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
0.61% 9.2490us 1 9.2490us 9.2490us 9.2490us volta_sgemm_32x32_sliced1x4_nt
0.56% 8.4800us 5 1.6960us 1.2480us 3.2320us singa::cuda::KernelMult(unsigned long, float const *, float, float*)
0.51% 7.7760us 4 1.9440us 1.7280us 2.2720us [CUDA memset]
0.48% 7.3600us 4 1.8400us 1.4720us 2.0160us void axpy_kernel_ref<float, float>(cublasAxpyParamsRef<float, float, float>)
0.42% 6.4000us 1 6.4000us 6.4000us 6.4000us volta_sgemm_128x32_tn
0.38% 5.7280us 4 1.4320us 1.2800us 1.6640us singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
0.36% 5.4400us 2 2.7200us 1.7920us 3.6480us void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
0.29% 4.3850us 2 2.1920us 1.2480us 3.1370us singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
0.27% 4.1600us 1 4.1600us 4.1600us 4.1600us void flip_filter<float, float>(float*, float const *, int, int, int, int)
0.26% 4.0320us 2 2.0160us 2.0160us 2.0160us void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.25% 3.8730us 1 3.8730us 3.8730us 3.8730us void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
0.25% 3.8400us 1 3.8400us 3.8400us 3.8400us void op_generic_tensor_kernel<int=4, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
0.25% 3.7770us 1 3.7770us 3.7770us 3.7770us void cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>(cudnnTensorStruct, float const *, cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>, float*, int, float, float*, int, int)
0.20% 3.1040us 1 3.1040us 3.1040us 3.1040us singa::cuda::KernelReLUBackward(unsigned long, float const *, float const *, float*)
0.20% 3.0400us 1 3.0400us 3.0400us 3.0400us void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
0.17% 2.6240us 1 2.6240us 2.6240us 2.6240us void gemmk1_kernel<float, int=256, int=5, bool=0, bool=0, bool=0, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>, float>(cublasGemmk1Params<float, float const , cublasGemvTensorStridedBatched<float const >, float>)
0.17% 2.5600us 1 2.5600us 2.5600us 2.5600us singa::cuda::KernelRelu(unsigned long, float const *, float*)
0.16% 2.4960us 1 2.4960us 2.4960us 2.4960us void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<float>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
0.15% 2.2720us 1 2.2720us 2.2720us 2.2720us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.14% 2.1770us 1 2.1770us 2.1770us 2.1770us void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
0.14% 2.1760us 1 2.1760us 2.1760us 2.1760us void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<float const >, cublasGemvTensorStridedBatched<float>>>(float const )
0.12% 1.8880us 1 1.8880us 1.8880us 1.8880us singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, float const *, int const *, float*)
0.12% 1.8240us 1 1.8240us 1.8240us 1.8240us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.12% 1.8240us 1 1.8240us 1.8240us 1.8240us singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, float const *, int const *, float*)
0.11% 1.6320us 1 1.6320us 1.6320us 1.6320us [CUDA memcpy DtoH]
0.10% 1.5040us 1 1.5040us 1.5040us 1.5040us singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
training used time 0.01820 sec (fp16 run, from the command below)
python examples/cnn/train_cnn_half.py -pfloat16