Skip to content

Instantly share code, notes, and snippets.

@dcslin
Last active October 4, 2020 15:23
Show Gist options
  • Save dcslin/aa90e5258724cb9add9e9d67ae43e5fd to your computer and use it in GitHub Desktop.
conv+relu+fc, profiled in fp32 and fp16
@dcslin
Copy link
Author

dcslin commented Oct 4, 2020

training time: 0.01540 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat32

Time(%)      Time     Calls       Avg       Min       Max  Name
 73.79%  1.1260ms         1  1.1260ms  1.1260ms  1.1260ms  generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
  5.29%  80.708us         1  80.708us  80.708us  80.708us  void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
  2.63%  40.099us         1  40.099us  40.099us  40.099us  void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
  2.28%  34.721us         3  11.573us  10.848us  12.800us  void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  1.65%  25.121us         1  25.121us  25.121us  25.121us  volta_gcgemm_32x32_nt
  1.32%  20.160us        14  1.4400us  1.2800us  1.9520us  void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
  1.29%  19.617us         9  2.1790us  1.1200us  10.017us  [CUDA memcpy HtoD]
  1.22%  18.594us        13  1.4300us  1.1840us  3.5210us  [CUDA memcpy DtoD]
  1.04%  15.905us         1  15.905us  15.905us  15.905us  void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
  0.78%  11.872us         1  11.872us  11.872us  11.872us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  0.68%  10.369us         1  10.369us  10.369us  10.369us  void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.63%  9.6320us         1  9.6320us  9.6320us  9.6320us  volta_sgemm_32x32_sliced1x4_nn
  0.62%  9.4720us         1  9.4720us  9.4720us  9.4720us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  0.61%  9.2490us         1  9.2490us  9.2490us  9.2490us  volta_sgemm_32x32_sliced1x4_nt
  0.56%  8.4800us         5  1.6960us  1.2480us  3.2320us  singa::cuda::KernelMult(unsigned long, float const *, float, float*)
  0.51%  7.7760us         4  1.9440us  1.7280us  2.2720us  [CUDA memset]
  0.48%  7.3600us         4  1.8400us  1.4720us  2.0160us  void axpy_kernel_ref<float, float>(cublasAxpyParamsRef<float, float, float>)
  0.42%  6.4000us         1  6.4000us  6.4000us  6.4000us  volta_sgemm_128x32_tn
  0.38%  5.7280us         4  1.4320us  1.2800us  1.6640us  singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
  0.36%  5.4400us         2  2.7200us  1.7920us  3.6480us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
  0.29%  4.3850us         2  2.1920us  1.2480us  3.1370us  singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
  0.27%  4.1600us         1  4.1600us  4.1600us  4.1600us  void flip_filter<float, float>(float*, float const *, int, int, int, int)
  0.26%  4.0320us         2  2.0160us  2.0160us  2.0160us  void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.25%  3.8730us         1  3.8730us  3.8730us  3.8730us  void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
  0.25%  3.8400us         1  3.8400us  3.8400us  3.8400us  void op_generic_tensor_kernel<int=4, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.25%  3.7770us         1  3.7770us  3.7770us  3.7770us  void cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>(cudnnTensorStruct, float const *, cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>, float*, int, float, float*, int, int)
  0.20%  3.1040us         1  3.1040us  3.1040us  3.1040us  singa::cuda::KernelReLUBackward(unsigned long, float const *, float const *, float*)
  0.20%  3.0400us         1  3.0400us  3.0400us  3.0400us  void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
  0.17%  2.6240us         1  2.6240us  2.6240us  2.6240us  void gemmk1_kernel<float, int=256, int=5, bool=0, bool=0, bool=0, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>, float>(cublasGemmk1Params<float, float const , cublasGemvTensorStridedBatched<float const >, float>)
  0.17%  2.5600us         1  2.5600us  2.5600us  2.5600us  singa::cuda::KernelRelu(unsigned long, float const *, float*)
  0.16%  2.4960us         1  2.4960us  2.4960us  2.4960us  void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<float>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
  0.15%  2.2720us         1  2.2720us  2.2720us  2.2720us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
  0.14%  2.1770us         1  2.1770us  2.1770us  2.1770us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
  0.14%  2.1760us         1  2.1760us  2.1760us  2.1760us  void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<float const >, cublasGemvTensorStridedBatched<float>>>(float const )
  0.12%  1.8880us         1  1.8880us  1.8880us  1.8880us  singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, float const *, int const *, float*)
  0.12%  1.8240us         1  1.8240us  1.8240us  1.8240us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.12%  1.8240us         1  1.8240us  1.8240us  1.8240us  singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, float const *, int const *, float*)
  0.11%  1.6320us         1  1.6320us  1.6320us  1.6320us  [CUDA memcpy DtoH]
  0.10%  1.5040us         1  1.5040us  1.5040us  1.5040us  singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment