Skip to content

Instantly share code, notes, and snippets.

@dcslin
Last active October 4, 2020 15:23
Show Gist options
  • Save dcslin/aa90e5258724cb9add9e9d67ae43e5fd to your computer and use it in GitHub Desktop.
nvprof kernel profiles for a conv+relu+fc model, comparing fp32 and fp16 training
@dcslin
Copy link
Author

dcslin commented Oct 4, 2020

training used time 0.01820 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat16

Time(%)      Time     Calls       Avg       Min       Max  Name
 57.18%  1.1399ms         1  1.1399ms  1.1399ms  1.1399ms  generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
 17.24%  343.60us         1  343.60us  343.60us  343.60us  volta_hcudnn_128x128_stridedB_splitK_interior_nn_v1
  5.10%  101.73us         1  101.73us  101.73us  101.73us  volta_h884cudnn_256x128_ldg8_dgrad_exp_small_nhwc_tt_v1
  4.06%  80.868us         1  80.868us  80.868us  80.868us  void calc_bias_diff<int=2, __half, float, int=128, int=0>(cudnnTensorStruct, __half const *, cudnnTensorStruct, __half*, float, float, int)
  3.53%  70.436us         1  70.436us  70.436us  70.436us  turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_interior_nhwc_tn_v1
  1.27%  25.410us         2  12.705us  8.9290us  16.481us  volta_hgemm_128x64_nn
  1.27%  25.314us         4  6.3280us  1.9200us  11.905us  void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
  0.98%  19.585us         2  9.7920us  9.6010us  9.9840us  void nhwcToNchwKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
  0.85%  17.025us        12  1.4180us  1.2800us  1.7600us  void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
  0.84%  16.833us         1  16.833us  16.833us  16.833us  volta_hgemm_128x128_nt
  0.78%  15.456us        11  1.4050us  1.1200us  3.3600us  [CUDA memcpy HtoD]
  0.75%  14.880us         9  1.6530us  1.1520us  2.2400us  [CUDA memset]
  0.72%  14.370us        11  1.3060us  1.2160us  1.5680us  [CUDA memcpy DtoD]
  0.67%  13.441us         1  13.441us  13.441us  13.441us  volta_hgemm_128x128_tn
  0.62%  12.384us         7  1.7690us  1.2480us  4.2560us  singa::cuda::KernelFloat2Half(unsigned long, float const *, __half*)
  0.39%  7.8720us         4  1.9680us  1.5680us  2.2400us  void axpy_kernel_ref<__half, float>(cublasAxpyParamsRef<__half, __half, float>)
  0.29%  5.7280us         4  1.4320us  1.2480us  1.5360us  singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
  0.28%  5.6320us         3  1.8770us  1.2480us  3.1040us  singa::cuda::KernelMult(unsigned long, float const *, float, float*)
  0.26%  5.2160us         2  2.6080us  1.6960us  3.5200us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
  0.23%  4.5440us         1  4.5440us  4.5440us  4.5440us  void cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>(cudnnTensorStruct, __half const *, cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=0>, cudnnTensorStruct*, int, float, cudnnTensorStruct*, int, int)
  0.22%  4.4480us         2  2.2240us  1.9200us  2.5280us  void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.22%  4.4160us         2  2.2080us  1.2800us  3.1360us  singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
  0.19%  3.8410us         1  3.8410us  3.8410us  3.8410us  void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
  0.19%  3.8400us         1  3.8400us  3.8400us  3.8400us  void op_generic_tensor_kernel<int=4, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, reducedDivisorArray, int)
  0.17%  3.4240us         2  1.7120us  1.5680us  1.8560us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.16%  3.1360us         1  3.1360us  3.1360us  3.1360us  void splitKreduce_kernel<__half, __half, __half>(cublasSplitKParams<__half>, __half const *, __half const *, __half*, __half const *, __half const *)
  0.15%  3.0090us         2  1.5040us  1.2800us  1.7290us  singa::cuda::KernelMult(unsigned long, __half const *, __half, __half*)
  0.15%  2.9120us         1  2.9120us  2.9120us  2.9120us  singa::cuda::KernelReLUBackward(unsigned long, __half const *, __half const *, __half*)
  0.13%  2.4960us         1  2.4960us  2.4960us  2.4960us  singa::cuda::KernelRelu(unsigned long, __half const *, __half*)
  0.12%  2.4320us         1  2.4320us  2.4320us  2.4320us  void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<__half>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
  0.11%  2.2720us         1  2.2720us  2.2720us  2.2720us  singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
  0.11%  2.2720us         1  2.2720us  2.2720us  2.2720us  void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<__half const >, cublasGemvTensorStridedBatched<float>>>(__half const )
  0.11%  2.1120us         1  2.1120us  2.1120us  2.1120us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
  0.09%  1.7600us         1  1.7600us  1.7600us  1.7600us  singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, __half const *, int const *, __half*)
  0.09%  1.7280us         1  1.7280us  1.7280us  1.7280us  cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.08%  1.6650us         1  1.6650us  1.6650us  1.6650us  [CUDA memcpy DtoH]
  0.08%  1.6640us         1  1.6640us  1.6640us  1.6640us  cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.08%  1.5680us         1  1.5680us  1.5680us  1.5680us  singa::cuda::KernelHalf2Float(unsigned long, __half const *, float*)
  0.08%  1.5040us         1  1.5040us  1.5040us  1.5040us  singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)
  0.07%  1.4720us         1  1.4720us  1.4720us  1.4720us  cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.07%  1.3440us         1  1.3440us  1.3440us  1.3440us  void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)

@dcslin
Copy link
Author

dcslin commented Oct 4, 2020

training used time 0.01540 sec
nvprof python examples/cnn/train_cnn_half.py -pfloat32

Time(%)      Time     Calls       Avg       Min       Max  Name
 73.79%  1.1260ms         1  1.1260ms  1.1260ms  1.1260ms  generate_seed_pseudo(__int64, __int64, __int64, curandOrdering, curandStateXORWOW*, unsigned int*)
  5.29%  80.708us         1  80.708us  80.708us  80.708us  void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
  2.63%  40.099us         1  40.099us  40.099us  40.099us  void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int)
  2.28%  34.721us         3  11.573us  10.848us  12.800us  void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  1.65%  25.121us         1  25.121us  25.121us  25.121us  volta_gcgemm_32x32_nt
  1.32%  20.160us        14  1.4400us  1.2800us  1.9520us  void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
  1.29%  19.617us         9  2.1790us  1.1200us  10.017us  [CUDA memcpy HtoD]
  1.22%  18.594us        13  1.4300us  1.1840us  3.5210us  [CUDA memcpy DtoD]
  1.04%  15.905us         1  15.905us  15.905us  15.905us  void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
  0.78%  11.872us         1  11.872us  11.872us  11.872us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  0.68%  10.369us         1  10.369us  10.369us  10.369us  void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  0.63%  9.6320us         1  9.6320us  9.6320us  9.6320us  volta_sgemm_32x32_sliced1x4_nn
  0.62%  9.4720us         1  9.4720us  9.4720us  9.4720us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  0.61%  9.2490us         1  9.2490us  9.2490us  9.2490us  volta_sgemm_32x32_sliced1x4_nt
  0.56%  8.4800us         5  1.6960us  1.2480us  3.2320us  singa::cuda::KernelMult(unsigned long, float const *, float, float*)
  0.51%  7.7760us         4  1.9440us  1.7280us  2.2720us  [CUDA memset]
  0.48%  7.3600us         4  1.8400us  1.4720us  2.0160us  void axpy_kernel_ref<float, float>(cublasAxpyParamsRef<float, float, float>)
  0.42%  6.4000us         1  6.4000us  6.4000us  6.4000us  volta_sgemm_128x32_tn
  0.38%  5.7280us         4  1.4320us  1.2800us  1.6640us  singa::cuda::KernelSub(unsigned long, float const *, float const *, float*)
  0.36%  5.4400us         2  2.7200us  1.7920us  3.6480us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
  0.29%  4.3850us         2  2.1920us  1.2480us  3.1370us  singa::cuda::KernelAdd(unsigned long, float const *, float, float*)
  0.27%  4.1600us         1  4.1600us  4.1600us  4.1600us  void flip_filter<float, float>(float*, float const *, int, int, int, int)
  0.26%  4.0320us         2  2.0160us  2.0160us  2.0160us  void op_generic_tensor_kernel<int=1, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.25%  3.8730us         1  3.8730us  3.8730us  3.8730us  void gemv2N_kernel<int, int, float, float, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensor<float const >, cublasGemvTensor<float>, float>>(float const )
  0.25%  3.8400us         1  3.8400us  3.8400us  3.8400us  void op_generic_tensor_kernel<int=4, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, reducedDivisorArray, int)
  0.25%  3.7770us         1  3.7770us  3.7770us  3.7770us  void cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>(cudnnTensorStruct, float const *, cudnn::detail::softmax_fw_kernel_resident<int=2, float, float, int=256, int=1, int=0, int=0, int=32, int=0>, float*, int, float, float*, int, int)
  0.20%  3.1040us         1  3.1040us  3.1040us  3.1040us  singa::cuda::KernelReLUBackward(unsigned long, float const *, float const *, float*)
  0.20%  3.0400us         1  3.0400us  3.0400us  3.0400us  void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
  0.17%  2.6240us         1  2.6240us  2.6240us  2.6240us  void gemmk1_kernel<float, int=256, int=5, bool=0, bool=0, bool=0, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>, float>(cublasGemmk1Params<float, float const , cublasGemvTensorStridedBatched<float const >, float>)
  0.17%  2.5600us         1  2.5600us  2.5600us  2.5600us  singa::cuda::KernelRelu(unsigned long, float const *, float*)
  0.16%  2.4960us         1  2.4960us  2.4960us  2.4960us  void reduce_1Block_kernel<float, int=128, int=7, cublasGemvTensorStridedBatched<float>, cublasGemvTensor<float>>(float const *, float, float, int, float const *, float, cublasGemvTensorStridedBatched<float>, cublasPointerMode_t)
  0.15%  2.2720us         1  2.2720us  2.2720us  2.2720us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
  0.14%  2.1770us         1  2.1770us  2.1770us  2.1770us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st)), rng_config<curandStateXORWOW>>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
  0.14%  2.1760us         1  2.1760us  2.1760us  2.1760us  void dot_kernel<float, int=128, int=0, cublasDotParams<cublasGemvTensor<float const >, cublasGemvTensorStridedBatched<float>>>(float const )
  0.12%  1.8880us         1  1.8880us  1.8880us  1.8880us  singa::cuda::KernelComputeCrossEntropy(bool, unsigned long, unsigned long, float const *, int const *, float*)
  0.12%  1.8240us         1  1.8240us  1.8240us  1.8240us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.12%  1.8240us         1  1.8240us  1.8240us  1.8240us  singa::cuda::KernelSoftmaxCrossEntropyBwd(bool, unsigned long, unsigned long, float const *, int const *, float*)
  0.11%  1.6320us         1  1.6320us  1.6320us  1.6320us  [CUDA memcpy DtoH]
  0.10%  1.5040us         1  1.5040us  1.5040us  1.5040us  singa::cuda::KernelCastFloat2Int(unsigned long, float const *, int*)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment