@ilia-cher
Created February 3, 2021 09:19
(pytorch) iliacher@devgpu083:~/local/pytorch (activities_default)$ python test_resnet50.py
Files already downloaded and verified
step:0
step:1
step:2
step:3
step:4
step:5
step:6
step:7
step:8
step:9
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name    Self CPU %    Self CPU    CPU total %    CPU total    CPU time avg    Self CUDA    Self CUDA %    CUDA total    CUDA time avg    # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
aten::thnn_conv2d_backward 8.25% 322.645ms 31.37% 1.226s 3.856ms 1.495s 61.17% 1.496s 4.705ms 318
aten::thnn_conv2d_forward 6.14% 239.948ms 13.05% 509.987ms 1.604ms 574.360ms 23.50% 574.360ms 1.806ms 318
void at::native::col2im_kernel<float, float>(long, f... 0.00% 0.000us 0.00% 0.000us 0.000us 509.336ms 20.84% 509.336ms 50.474us 10091
sgemm_32x32x32_NT_vec 0.00% 0.000us 0.00% 0.000us 0.000us 348.843ms 14.27% 348.843ms 50.896us 6854
sgemm_32x32x32_NN_vec 0.00% 0.000us 0.00% 0.000us 0.000us 266.423ms 10.90% 266.423ms 35.552us 7494
void at::native::im2col_kernel<float>(long, float co... 0.00% 0.000us 0.00% 0.000us 0.000us 246.419ms 10.08% 246.419ms 37.382us 6592
sgemm_32x32x32_TN_vec 0.00% 0.000us 0.00% 0.000us 0.000us 184.302ms 7.54% 184.302ms 28.748us 6411
sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 151.835ms 6.21% 151.835ms 87.867us 1728
maxwell_sgemm_128x64_tn 0.00% 0.000us 0.00% 0.000us 0.000us 106.377ms 4.35% 106.377ms 42.619us 2496
sgemm_32x32x32_TN 0.00% 0.000us 0.00% 0.000us 0.000us 106.306ms 4.35% 106.306ms 61.520us 1728
void at::native::batch_norm_backward_kernel<float, f... 0.00% 0.000us 0.00% 0.000us 0.000us 99.966ms 4.09% 99.966ms 310.453us 322
aten::native_batch_norm_backward 0.28% 10.982ms 0.76% 29.813ms 93.752us 96.933ms 3.97% 96.933ms 304.821us 318
aten::native_batch_norm 0.42% 16.396ms 0.88% 34.254ms 107.717us 72.668ms 2.97% 72.668ms 228.516us 318
maxwell_sgemm_128x64_nt 0.00% 0.000us 0.00% 0.000us 0.000us 72.624ms 2.97% 72.624ms 37.091us 1958
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 68.091ms 2.79% 68.091ms 115.213us 591
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 59.078ms 2.42% 59.078ms 17.244us 3426
sgemm_128x128x8_NT 0.00% 0.000us 0.00% 0.000us 0.000us 49.223ms 2.01% 49.223ms 64.092us 768
aten::threshold_backward 0.17% 6.594ms 0.32% 12.576ms 42.776us 39.858ms 1.63% 39.858ms 135.571us 294
void at::native::batch_norm_collect_statistics_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 38.091ms 1.56% 38.091ms 119.783us 318
void at::native::batch_norm_transform_input_kernel<f... 0.00% 0.000us 0.00% 0.000us 0.000us 34.577ms 1.41% 34.577ms 108.733us 318
aten::add_ 1.15% 44.860ms 5.02% 196.377ms 65.590us 34.567ms 1.41% 34.567ms 11.545us 2994
aten::threshold_ 0.10% 3.749ms 0.20% 7.971ms 27.112us 27.406ms 1.12% 27.406ms 93.218us 294
maxwell_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 24.644ms 1.01% 24.644ms 25.671us 960
void sgemm_largek_lds64<true, false, 5, 5, 4, 4, 4, ... 0.00% 0.000us 0.00% 0.000us 0.000us 21.423ms 0.88% 21.423ms 95.638us 224
Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 18.097ms 0.74% 18.097ms 76.682us 236
aten::copy_ 0.69% 27.042ms 1.24% 48.288ms 82.122us 17.864ms 0.73% 17.864ms 30.381us 588
void at::native::(anonymous namespace)::max_pool_bac... 0.00% 0.000us 0.00% 0.000us 0.000us 13.865ms 0.57% 13.865ms 1.981ms 7
aten::max_pool2d_with_indices_backward 0.01% 197.000us 0.02% 684.000us 114.000us 11.886ms 0.49% 12.914ms 2.152ms 6
Memset (Device) 0.00% 0.000us 0.00% 0.000us 0.000us 11.047ms 0.45% 11.047ms 1.085us 10182
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.074ms 0.17% 4.074ms 3.131us 1301
aten::fill_ 0.26% 10.354ms 0.63% 24.759ms 19.104us 3.898ms 0.16% 3.898ms 3.008us 1296
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.512ms 0.14% 3.512ms 3.116us 1127
aten::mul_ 0.41% 15.957ms 1.78% 69.576ms 72.025us 3.011ms 0.12% 3.011ms 3.117us 966
aten::max_pool2d_with_indices 0.01% 316.000us 0.01% 558.000us 93.000us 2.965ms 0.12% 2.965ms 494.167us 6
void at::native::(anonymous namespace)::max_pool_for... 0.00% 0.000us 0.00% 0.000us 0.000us 2.965ms 0.12% 2.965ms 494.167us 6
aten::mean 0.01% 261.000us 0.01% 419.000us 69.833us 695.000us 0.03% 695.000us 115.833us 6
void at::native::reduce_kernel<512, 1, at::native::R... 0.00% 0.000us 0.00% 0.000us 0.000us 695.000us 0.03% 695.000us 115.833us 6
void scal_kernel<float, float, 1, false, 6, 5, 5, 3>... 0.00% 0.000us 0.00% 0.000us 0.000us 643.000us 0.03% 643.000us 2.871us 224
aten::mm 0.01% 330.000us 0.01% 583.000us 48.583us 482.000us 0.02% 482.000us 40.167us 12
aten::div 0.52% 20.235ms 0.61% 23.698ms 119.687us 456.000us 0.02% 456.000us 2.303us 198
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 456.000us 0.02% 456.000us 76.000us 6
aten::addmm 0.01% 431.000us 0.02% 762.000us 127.000us 450.000us 0.02% 450.000us 75.000us 6
void kernelPointwiseApply1<TensorFillOp<float>, floa... 0.00% 0.000us 0.00% 0.000us 0.000us 400.000us 0.02% 400.000us 1.238us 323
aten::add 0.21% 8.018ms 0.34% 13.350ms 41.981us 391.000us 0.02% 391.000us 1.230us 318
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 391.000us 0.02% 391.000us 1.230us 318
aten::_log_softmax 0.00% 171.000us 0.01% 418.000us 69.667us 94.000us 0.00% 94.000us 15.667us 6
void (anonymous namespace)::softmax_warp_forward<flo... 0.00% 0.000us 0.00% 0.000us 0.000us 94.000us 0.00% 94.000us 15.667us 6
void at::native::reduce_kernel<128, 4, at::native::R... 0.00% 0.000us 0.00% 0.000us 0.000us 84.000us 0.00% 84.000us 14.000us 6
aten::_log_softmax_backward_data 0.00% 170.000us 0.01% 462.000us 77.000us 83.000us 0.00% 83.000us 13.833us 6
void (anonymous namespace)::softmax_warp_backward<fl... 0.00% 0.000us 0.00% 0.000us 0.000us 83.000us 0.00% 83.000us 13.833us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 55.000us 0.00% 55.000us 9.167us 6
aten::nll_loss_forward 0.01% 280.000us 0.01% 384.000us 64.000us 34.000us 0.00% 34.000us 5.667us 6
void cunn_ClassNLLCriterion_updateOutput_kernel<floa... 0.00% 0.000us 0.00% 0.000us 0.000us 34.000us 0.00% 34.000us 5.667us 6
aten::nll_loss_backward 0.01% 233.000us 0.01% 494.000us 82.333us 25.000us 0.00% 25.000us 4.167us 6
void cunn_ClassNLLCriterion_updateGradInput_kernel<f... 0.00% 0.000us 0.00% 0.000us 0.000us 19.000us 0.00% 19.000us 3.167us 6
aten::zeros 0.01% 510.000us 0.02% 650.000us 27.083us 0.000us 0.00% 0.000us 0.000us 24
aten::empty 0.48% 18.886ms 0.48% 18.886ms 4.911us 0.000us 0.00% 0.000us 0.000us 3846
aten::zero_ 0.30% 11.740ms 0.93% 36.334ms 27.651us 0.000us 0.00% 3.892ms 2.962us 1314
ProfilerStep* 36.30% 1.419s 65.70% 2.568s 428.021ms 0.000us 0.00% 732.149ms 122.025ms 6
enumerate(DataLoader)#_SingleProcessDataLoaderIter._... 4.00% 156.369ms 6.31% 246.581ms 41.097ms 0.000us 0.00% 0.000us 0.000us 6
aten::set_ 4.10% 160.433ms 4.10% 160.433ms 2.636us 0.000us 0.00% 0.000us 0.000us 60864
aten::view 0.19% 7.597ms 0.19% 7.597ms 2.771us 0.000us 0.00% 0.000us 0.000us 2742
aten::permute 0.19% 7.507ms 0.20% 7.850ms 40.885us 0.000us 0.00% 0.000us 0.000us 192
aten::as_strided 0.02% 674.000us 0.02% 674.000us 1.560us 0.000us 0.00% 0.000us 0.000us 432
aten::contiguous 0.03% 1.284ms 0.41% 15.970ms 83.177us 0.000us 0.00% 0.000us 0.000us 192
aten::empty_like 0.18% 6.964ms 0.39% 15.097ms 10.146us 0.000us 0.00% 0.000us 0.000us 1488
aten::to 0.08% 3.213ms 1.02% 39.888ms 97.765us 0.000us 0.00% 17.864ms 43.784us 408
aten::empty_strided 0.05% 1.808ms 0.05% 1.808ms 4.498us 0.000us 0.00% 0.000us 0.000us 402
aten::stack 0.01% 472.000us 0.60% 23.545ms 3.924ms 0.000us 0.00% 0.000us 0.000us 6
aten::unsqueeze 0.02% 682.000us 0.02% 910.000us 4.740us 0.000us 0.00% 0.000us 0.000us 192
aten::cat 0.00% 66.000us 0.57% 22.163ms 3.694ms 0.000us 0.00% 0.000us 0.000us 6
aten::_cat 0.56% 21.916ms 0.57% 22.097ms 3.683ms 0.000us 0.00% 0.000us 0.000us 6
aten::resize_ 0.06% 2.271ms 0.06% 2.271ms 6.526us 0.000us 0.00% 0.000us 0.000us 348
aten::narrow 0.00% 29.000us 0.00% 74.000us 12.333us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 0.00% 35.000us 0.00% 45.000us 7.500us 0.000us 0.00% 0.000us 0.000us 6
aten::detach_ 0.00% 23.000us 0.00% 38.000us 6.333us 0.000us 0.00% 0.000us 0.000us 6
detach_ 0.00% 15.000us 0.00% 15.000us 2.500us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 1.02% 39.746ms 1.02% 39.746ms 194.833us 0.000us 0.00% 0.000us 0.000us 204
cudaStreamSynchronize 0.01% 461.000us 0.01% 461.000us 38.417us 0.000us 0.00% 0.000us 0.000us 12
aten::conv2d 0.08% 3.054ms 13.46% 526.234ms 1.655ms 0.000us 0.00% 574.360ms 1.806ms 318
aten::convolution 0.07% 2.805ms 13.38% 523.180ms 1.645ms 0.000us 0.00% 574.360ms 1.806ms 318
aten::_convolution 0.10% 3.858ms 13.31% 520.375ms 1.636ms 0.000us 0.00% 574.360ms 1.806ms 318
aten::_convolution_nogroup 0.09% 3.620ms 13.21% 516.517ms 1.624ms 0.000us 0.00% 574.360ms 1.806ms 318
aten::_nnpack_available 0.01% 260.000us 0.01% 260.000us 0.818us 0.000us 0.00% 0.000us 0.000us 318
aten::thnn_conv2d 0.07% 2.650ms 13.11% 512.637ms 1.612ms 0.000us 0.00% 574.360ms 1.806ms 318
cudaMemsetAsync 2.88% 112.451ms 2.88% 112.451ms 11.044us 0.000us 0.00% 0.000us 0.000us 10182
cudaLaunchKernel 28.59% 1.118s 28.59% 1.118s 20.453us 0.000us 0.00% 0.000us 0.000us 54648
aten::batch_norm 0.08% 2.978ms 1.13% 44.213ms 139.035us 0.000us 0.00% 72.668ms 228.516us 318
aten::_batch_norm_impl_index 0.14% 5.408ms 1.05% 41.235ms 129.670us 0.000us 0.00% 72.668ms 228.516us 318
aten::reshape 0.11% 4.326ms 0.17% 6.680ms 6.958us 0.000us 0.00% 0.000us 0.000us 960
aten::relu_ 0.15% 5.774ms 0.35% 13.745ms 46.752us 0.000us 0.00% 27.406ms 93.218us 294
aten::max_pool2d 0.00% 82.000us 0.02% 640.000us 106.667us 0.000us 0.00% 2.965ms 494.167us 6
aten::adaptive_avg_pool2d 0.00% 79.000us 0.01% 498.000us 83.000us 0.000us 0.00% 695.000us 115.833us 6
aten::flatten 0.00% 46.000us 0.00% 141.000us 23.500us 0.000us 0.00% 0.000us 0.000us 6
aten::t 0.01% 275.000us 0.01% 575.000us 19.167us 0.000us 0.00% 0.000us 0.000us 30
aten::transpose 0.01% 226.000us 0.01% 300.000us 10.000us 0.000us 0.00% 0.000us 0.000us 30
aten::expand 0.00% 86.000us 0.00% 105.000us 8.750us 0.000us 0.00% 0.000us 0.000us 12
aten::log_softmax 0.00% 61.000us 0.01% 479.000us 79.833us 0.000us 0.00% 94.000us 15.667us 6
aten::nll_loss 0.00% 47.000us 0.01% 431.000us 71.833us 0.000us 0.00% 34.000us 5.667us 6
Optimizer.zero_grad#SGD.zero_grad 0.24% 9.235ms 0.91% 35.664ms 5.944ms 0.000us 0.00% 1.808ms 301.333us 6
aten::ones_like 0.00% 90.000us 0.01% 368.000us 61.333us 0.000us 0.00% 6.000us 1.000us 6
NllLossBackward 0.00% 144.000us 0.02% 638.000us 106.333us 0.000us 0.00% 25.000us 4.167us 6
LogSoftmaxBackward 0.00% 98.000us 0.01% 560.000us 93.333us 0.000us 0.00% 83.000us 13.833us 6
AddmmBackward 0.00% 187.000us 0.03% 1.109ms 184.833us 0.000us 0.00% 482.000us 80.333us 6
aten::conj 0.00% 41.000us 0.00% 41.000us 3.417us 0.000us 0.00% 0.000us 0.000us 12
torch::autograd::AccumulateGrad 0.15% 5.680ms 1.36% 53.090ms 54.959us 0.000us 0.00% 4.170ms 4.317us 966
TBackward 0.00% 21.000us 0.00% 124.000us 20.667us 0.000us 0.00% 0.000us 0.000us 6
ViewBackward 0.00% 24.000us 0.00% 98.000us 16.333us 0.000us 0.00% 0.000us 0.000us 6
MeanBackward1 0.00% 65.000us 0.01% 495.000us 82.500us 0.000us 0.00% 456.000us 76.000us 6
ReluBackward1 0.06% 2.240ms 0.38% 14.816ms 50.395us 0.000us 0.00% 39.858ms 135.571us 294
AddBackward0 0.01% 252.000us 0.01% 252.000us 2.625us 0.000us 0.00% 0.000us 0.000us 96
NativeBatchNormBackward 0.11% 4.314ms 0.87% 34.127ms 107.318us 0.000us 0.00% 96.933ms 304.821us 318
ThnnConv2DBackward 0.11% 4.462ms 31.48% 1.231s 3.870ms 0.000us 0.00% 1.496s 4.705ms 318
MaxPool2DWithIndicesBackward 0.00% 75.000us 0.02% 759.000us 126.500us 0.000us 0.00% 12.914ms 2.152ms 6
aten::zeros_like 0.00% 44.000us 0.01% 300.000us 50.000us 0.000us 0.00% 1.028ms 171.333us 6
aten::resize_as_ 0.00% 79.000us 0.00% 87.000us 14.500us 0.000us 0.00% 0.000us 0.000us 6
cudaEventQuery 0.01% 258.000us 0.01% 258.000us 1.344us 0.000us 0.00% 0.000us 0.000us 192
cudaEventRecord 0.00% 183.000us 0.00% 183.000us 0.953us 0.000us 0.00% 0.000us 0.000us 192
Optimizer.step#SGD.step 0.64% 24.981ms 6.13% 239.797ms 39.966ms 0.000us 0.00% 10.482ms 1.747ms 6
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.909s
Self CUDA time total: 2.444s
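
For context, here is a minimal sketch of what a test_resnet50.py producing output like the above could look like. It is reconstructed from the log, not taken from the actual script: CIFAR-10 comes from torchvision ("Files already downloaded and verified" is its download message), the model is ResNet-50 (318 recorded conv2d calls = 53 convolutions x 6 profiled steps), the optimizer is SGD (Optimizer.step#SGD.step), the loss is cross entropy (aten::log_softmax + aten::nll_loss), and the table is a key_averages() summary sorted by CUDA time. The batch size, transforms, and profiler schedule below are guesses.

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as T
from torch.profiler import profile, ProfilerActivity, schedule

device = torch.device("cuda")

# CIFAR-10; "Files already downloaded and verified" in the log is torchvision's message.
train_set = torchvision.datasets.CIFAR10(root="./data", train=True,
                                         download=True, transform=T.ToTensor())
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)  # batch size assumed

model = torchvision.models.resnet50().to(device)
criterion = nn.CrossEntropyLoss()                    # appears as log_softmax + nll_loss in the table
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# The table shows 6 ProfilerStep* events out of 10 printed steps, consistent with a
# wait/warmup schedule; the exact split below is a guess.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             schedule=schedule(wait=2, warmup=2, active=6)) as prof:
    for step, (inputs, labels) in enumerate(train_loader):
        if step >= 10:
            break
        print(f"step:{step}")
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
        prof.step()                                  # emits the ProfilerStep* ranges

print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))

One detail the sketch does not try to reproduce: the convolutions in the table run through aten::thnn_conv2d (im2col/col2im plus sgemm kernels) rather than cuDNN, so the original run apparently had cuDNN disabled or unavailable.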