Skip to content

Instantly share code, notes, and snippets.

@dcslin
Created September 27, 2020 03:31
Show Gist options
  • Save dcslin/02ccf905f5082adb1009cddeec90af7d to your computer and use it in GitHub Desktop.
Save dcslin/02ccf905f5082adb1009cddeec90af7d to your computer and use it in GitHub Desktop.
Benchmark of PyTorch MNIST mixed-precision training with Apex
"""
This script is modified from https://github.com/pytorch/examples.git
"""
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from apex import amp
import time
class Net(nn.Module):
    """Small convolutional classifier used for the MNIST benchmark.

    Two 3x3 convolutions, one 2x2 max-pool, then two fully connected
    layers that produce log-probabilities over 10 classes.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout on the 4-D feature map that follows pooling.
        self.dropout1 = nn.Dropout2d(0.25)
        # This layer is applied to the flattened (2-D) output of fc1.
        # Dropout2d on 2-D input is deprecated in PyTorch; plain Dropout is
        # the documented replacement that retains the same behaviour.
        self.dropout2 = nn.Dropout(0.5)
        # 9216 is the flattened size fed to fc1 (64 channels * 12 * 12 when
        # the input images are 28x28).
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for the image batch ``x``."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the FC layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch with Apex AMP loss scaling and print its duration.

    Args:
        args: Parsed options; ``log_interval`` and ``dry_run`` are read.
        model: Network to optimize; it is switched to training mode.
        device: Device every batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches. Its ``len()``
            and ``dataset`` attribute are used for the progress message.
        optimizer: Optimizer passed to ``amp.scale_loss`` and stepped once
            per batch.
        epoch: Epoch number shown in the progress message.
    """
    # CUDA kernels are launched asynchronously, so the GPU has to be drained
    # before each clock reading or the measured interval will not cover the
    # work that was actually queued during this epoch.
    on_cuda = torch.device(device).type == 'cuda'
    if on_cuda:
        torch.cuda.synchronize()
    start_time = time.perf_counter()

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        # Backpropagate through the AMP-scaled loss rather than the raw loss.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break

    if on_cuda:
        torch.cuda.synchronize()
    end_time = time.perf_counter()
    print("training used time %.5f sec" % (end_time - start_time))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
def main():
    """Parse command-line options, then train and evaluate the MNIST CNN with Apex AMP.

    Each epoch runs ``train`` then ``test`` and steps the learning-rate
    scheduler. With ``--save-model`` the final weights are written to
    ``mnist_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    # Without a default, omitting the flag would hand None to amp.initialize.
    parser.add_argument('--opt-level', type=str, default='O1',
                        choices=['O0', 'O1', 'O2', 'O3'],
                        help='Apex AMP optimization level (default: O1)')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Separate loader options so that --test-batch-size actually takes effect.
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    # AMP wraps the model and optimizer; the scheduler is built on the
    # optimizer that amp.initialize returns.
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
@dcslin
Copy link
Author

dcslin commented Sep 27, 2020

Command:

python train.py --opt-level=O0
python train.py --opt-level=O1
python train.py --opt-level=O2
python train.py --opt-level=O3

Hardware:

1 × RTX 2080 Ti
CUDA 10.2
cuDNN 7.6

Result:

O0(fp32 only): 14.21905 sec
O1(mixed, selected compute in fp16): 19.12530 sec
O2(almost fp16): 15.02236 sec
O3(fp16 only): 14.62414 sec

@dcslin
Copy link
Author

dcslin commented Sep 30, 2020

O3 (fp16 only) — GPU profile:

Time(%)      Time     Calls       Avg       Min       Max  Name
 47.98%  884.67ms      1876  471.57us  205.00us  658.40us  turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
  7.48%  137.90ms       938  147.02us  98.981us  149.51us  volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
  3.86%  71.183ms      6880  10.346us  1.8570us  22.049us  void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
  3.82%  70.370ms       938  75.021us  37.506us  76.451us  void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<c10::Half, float>(int, c10::Half const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<c10::Half, float>*)
  3.32%  61.296ms      6099  10.050us     896ns  27.233us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIN3c104HalfEEEvRNS_14TensorIteratorET_S7_EUlS4_S4_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  2.88%  53.054ms      1095  48.451us  18.081us  52.514us  turing_fp16_s1688cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
  2.33%  42.928ms      2190  19.601us  7.2320us  24.673us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESE_ILi1EjENS0_6memory15LoadWithoutCastENSH_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  2.12%  39.164ms     15000  2.6100us  1.0880us  14.369us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  2.07%  38.174ms      2204  17.320us     961ns  5.4919ms  [CUDA memcpy HtoD]
  1.92%  35.482ms     15008  2.3640us     960ns  10.080us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfES8_E_EEvS4_RKT_EUlS8_E0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  1.74%  32.068ms      1876  17.093us  9.8880us  19.649us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
  1.72%  31.695ms     15008  2.1110us     992ns  14.144us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE6_clEvENKUlvE_clEvEUlNS4_4HalfES9_S9_E_NS_6detail5ArrayIPcLi4EEEEEviT0_T1_
  1.66%  30.622ms     15008  2.0400us     992ns  9.8890us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS5_4HalfES9_E_EEvS4_RKT_EUlS9_E0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  1.57%  28.964ms     15008  1.9290us     992ns  9.7600us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_16sqrt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE3_clEvENKUlvE_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  1.31%  24.142ms      1095  22.047us  4.5120us  22.850us  void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<c10::Half, c10::Half>(int, c10::Half const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<c10::Half, c10::Half>*, long*)
  1.09%  20.160ms      8450  2.3850us     768ns  10.432us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE9_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
  1.08%  19.827ms      1095  18.107us  4.3210us  19.585us  void nhwcToNchwKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
  0.97%  17.811ms      1093  16.295us  15.744us  18.753us  turing_s1688gemm_fp16_128x128_ldg8_tn
  0.95%  17.480ms      7504  2.3290us  1.1200us  14.497us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfES7_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.92%  16.907ms      1876  9.0120us  6.9760us  10.017us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfES7_E_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESD_ILi1EjENS0_6memory15LoadWithoutCastENSG_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.90%  16.632ms      9380  1.7730us     896ns  14.017us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfES7_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.80%  14.739ms       937  15.730us  14.784us  19.233us  turing_fp16_s1688gemm_fp16_128x128_ldg8_f2f_nn
  0.66%  12.195ms      1094  11.147us  10.305us  11.616us  volta_fp16_scudnn_fp16_128x32_relu_interior_nn_v1
  0.56%  10.338ms       938  11.021us  8.2250us  11.936us  turing_fp16_s1688gemm_fp16_128x128_ldg8_f2f_nt
  0.54%  9.9298ms      2041  4.8650us  1.2800us  16.353us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESE_NS0_6memory12LoadWithCastILi1EEENSF_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.50%  9.1646ms      1095  8.3690us  8.0650us  9.0240us  volta_fp16_sgemm_fp16_32x32_sliced1x4_tn
  0.48%  8.9079ms      2190  4.0670us  2.7200us  7.7760us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESE_NS0_6memory15LoadWithoutCastENSF_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.45%  8.2085ms       938  8.7510us  5.6000us  13.920us  void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
  0.40%  7.3664ms      3760  1.9590us  1.0240us  17.729us  [CUDA memset]
  0.36%  6.6373ms       938  7.0760us  6.9120us  7.7440us  volta_fp16_sgemm_fp16_32x32_sliced1x4_nt
  0.35%  6.4816ms       938  6.9100us  5.0890us  7.4570us  _ZN2at6native13reduce_kernelILi128ELi4ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
  0.32%  5.9368ms       938  6.3290us  5.0240us  8.4800us  _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
  0.28%  5.1727ms      3127  1.6540us  1.2800us  8.5130us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.26%  4.7299ms      1095  4.3190us  4.0320us  8.0330us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.26%  4.7163ms       937  5.0330us  4.8000us  8.5770us  volta_fp16_sgemm_fp16_128x32_nn
  0.24%  4.3862ms      1876  2.3380us  2.0800us  7.0400us  _ZN2at6native92_GLOBAL__N__68_tmpxft_00000d67_00000000_11_DistributionBernoulli_compute_75_cpp1_ii_cb3dce8443distribution_elementwise_grid_stride_kernelIfLi4EZNS0_9templates4cuda21uniform_and_transformIN3c104HalfEfLm4EPNS_17CUDAGeneratorImplEZZZNS4_16bernoulli_kernelIS9_EEvRNS_14TensorIteratorEdT_ENKUlvE_clEvENKUlvE6_clEvEUlfE_EEvSC_T2_T3_EUlP24curandStatePhilox4_32_10E0_ZNS1_27distribution_nullary_kernelIS7_fLi4ES9_SL_SG_EEvSC_SH_RKSI_T4_EUlifE_EEviSt4pairImmET1_SH_
  0.20%  3.6846ms      1095  3.3640us  2.9760us  6.1760us  void splitKreduce_kernel<float, __half, float>(cublasSplitKParams<float>, float const *, __half const *, __half*, float const *, float const *)
  0.20%  3.6346ms       938  3.8740us  3.4890us  10.305us  void nhwcToNchwKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, float const *, __half*, float, float)
  0.19%  3.5205ms      1876  1.8760us  1.3440us  10.112us  cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
  0.19%  3.4473ms      1095  3.1480us  2.5280us  6.2720us  void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
  0.17%  3.1329ms       938  3.3390us  3.2320us  7.3600us  void nhwcToNchwKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, float const *, __half*, float, float)
  0.15%  2.8574ms      1876  1.5230us  1.1520us  8.1290us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE3_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.14%  2.6527ms      1876  1.4140us  1.0240us  8.3200us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
  0.12%  2.2396ms       938  2.3870us  2.1760us  10.209us  void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
  0.11%  2.0994ms      1095  1.9170us  1.7280us  5.0570us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, int, int, int)
  0.08%  1.5511ms       938  1.6530us  1.4080us  6.4330us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, c10::Half const , int, int, int)
  0.07%  1.2831ms       938  1.3670us     992ns  9.7610us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
  0.07%  1.2609ms       938  1.3440us  1.0560us  5.3450us  cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
  0.05%  831.46us       157  5.2950us  5.0880us  8.6400us  void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::ArgMaxOps<float>, unsigned int, long, int=4>>(float)
  0.04%  737.77us       408  1.8080us  1.0880us  6.2080us  [CUDA memcpy DtoH]
  0.04%  691.00us       157  4.4010us  4.3200us  4.6400us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
  0.02%  374.52us       157  2.3850us  1.9520us  4.6720us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.01%  238.79us       157  1.5200us  1.2480us  4.4160us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.00%  32.706us         2  16.353us  15.681us  17.025us  turing_s884gemm_fp16_128x64_ldg8_tn
  0.00%  10.369us         1  10.369us  10.369us  10.369us  turing_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn
  0.00%  8.8010us         1  8.8010us  8.8010us  8.8010us  void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
  0.00%  5.1520us         1  5.1520us  5.1520us  5.1520us  volta_fp16_sgemm_fp16_32x128_nn

@dcslin
Copy link
Author

dcslin commented Sep 30, 2020

O0 (fp32 only) — GPU profile:

Time(%)      Time     Calls       Avg       Min       Max  Name
  9.92%  182.58ms      3752  48.662us  8.6080us  97.381us  void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  8.18%  150.54ms      3127  48.141us  18.305us  63.971us  void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
  7.64%  140.53ms      6099  23.041us     928ns  53.347us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  7.27%  133.87ms      3127  42.812us  4.6080us  102.76us  void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
  6.41%  117.98ms      2189  53.897us  19.617us  77.732us  volta_sgemm_128x64_nn
  4.89%  89.924ms       938  95.867us  50.306us  97.829us  volta_sgemm_128x64_nt
  4.66%  85.714ms      1876  45.689us  12.193us  78.468us  void flip_filter<float, float>(float*, float const *, int, int, int, int)
  4.16%  76.545ms       938  81.604us  50.275us  85.412us  volta_cgemm_32x32_tn
  3.96%  72.832ms       938  77.645us  40.130us  78.980us  void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>*)
  3.89%  71.509ms     15000  4.7670us  1.0880us  27.841us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  3.49%  64.184ms     15008  4.2760us     928ns  27.490us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlfffE_NS_6detail5ArrayIPcLi4EEEEEviT0_T1_
  3.29%  60.532ms      2190  27.640us  7.4560us  37.954us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESD_ILi1EjENS0_6memory15LoadWithoutCastENSG_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  2.67%  49.128ms     15008  3.2730us     928ns  18.977us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  2.67%  49.107ms     15008  3.2720us     928ns  20.097us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  2.44%  44.844ms      1876  23.904us  10.176us  28.162us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  2.17%  40.017ms     15008  2.6660us     992ns  19.361us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_16sqrt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvENKUlvE_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  2.06%  37.948ms      1876  20.228us  8.6410us  32.129us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  1.99%  36.579ms      9380  3.8990us     896ns  27.778us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  1.94%  35.756ms      1095  32.653us  4.6080us  33.570us  void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>*, long*)
  1.91%  35.077ms      1094  32.063us  21.729us  35.426us  volta_sgemm_64x32_sliced1x4_tn
  1.90%  34.882ms      2204  15.826us     992ns  2.8076ms  [CUDA memcpy HtoD]
  1.88%  34.581ms      7504  4.6080us  1.0560us  27.009us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  1.81%  33.295ms      9388  3.5460us     768ns  18.049us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
  1.32%  24.357ms       937  25.994us  25.409us  27.201us  volta_sgemm_64x64_nn
  1.15%  21.216ms       938  22.618us  14.913us  23.233us  volta_sgemm_128x32_nt
  1.08%  19.931ms      1876  10.624us  6.7840us  12.321us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESB_ILi1EjENS0_6memory15LoadWithoutCastENSE_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  1.03%  18.878ms       938  20.126us  9.4090us  24.930us  void gemv2T_kernel_val<int, int, float2, float2, float2, int=128, int=16, int=2, int=2, bool=0, cublasGemvParams<cublasGemvTensorBatched<float2 const >, cublasGemvTensorBatched<float2>, float2>>(float2 const , float2, float2)
  0.54%  9.8585ms      3127  3.1520us  2.4640us  11.104us  void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
  0.49%  9.0603ms      2190  4.1370us  3.3920us  9.2800us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.46%  8.4426ms      1096  7.7030us  6.9440us  22.625us  volta_sgemm_32x32_sliced1x4_tn
  0.36%  6.6202ms       938  7.0570us  4.8970us  8.8000us  _ZN2at6native13reduce_kernelILi128ELi4ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  0.34%  6.3149ms       938  6.7320us  6.4320us  7.1370us  volta_sgemm_32x32_sliced1x4_nt
  0.30%  5.5124ms       938  5.8760us  4.6080us  8.0320us  _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  0.25%  4.6503ms       938  4.9570us  4.8330us  7.6160us  volta_sgemm_32x128_nn
  0.24%  4.4169ms      1876  2.3540us  2.1120us  11.073us  _ZN2at6native92_GLOBAL__N__68_tmpxft_00000d67_00000000_11_DistributionBernoulli_compute_75_cpp1_ii_cb3dce8443distribution_elementwise_grid_stride_kernelIfLi4EZNS0_9templates4cuda21uniform_and_transformIffLm4EPNS_17CUDAGeneratorImplEZZZNS4_16bernoulli_kernelIS7_EEvRNS_14TensorIteratorEdT_ENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvSA_T2_T3_EUlP24curandStatePhilox4_32_10E0_ZNS1_27distribution_nullary_kernelIffLi4ES7_SJ_SE_EEvSA_SF_RKSG_T4_EUlifE_EEviSt4pairImmET1_SF_
  0.19%  3.5246ms      1876  1.8780us  1.4400us  8.7370us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.19%  3.4728ms      1095  3.1710us  2.5600us  8.6410us  void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
  0.17%  3.0794ms      1095  2.8120us  2.4320us  6.0480us  void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
  0.15%  2.7375ms      1876  1.4590us  1.1520us  7.3930us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.12%  2.2652ms       938  2.4140us  2.1120us  5.3130us  void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
  0.11%  2.0554ms      1095  1.8770us  1.6960us  6.1760us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
  0.09%  1.5966ms       938  1.7020us  1.4080us  7.4250us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
  0.08%  1.4663ms       946  1.5500us  1.0560us  6.2720us  [CUDA memset]
  0.04%  824.06us       157  5.2480us  5.0560us  6.3680us  void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::ArgMaxOps<float>, unsigned int, long, int=4>>(float)
  0.04%  699.49us       157  4.4550us  4.1600us  4.5770us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
  0.04%  673.12us       408  1.6490us  1.0880us  6.7840us  [CUDA memcpy DtoH]
  0.02%  371.25us       157  2.3640us  1.9840us  3.1680us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.01%  229.04us       157  1.4580us  1.2480us  2.4960us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.00%  15.936us         1  15.936us  15.936us  15.936us  volta_sgemm_128x32_nn
  0.00%  9.6000us         1  9.6000us  9.6000us  9.6000us  volta_scudnn_128x32_relu_interior_nn_v1
  0.00%  1.3760us         1  1.3760us  1.3760us  1.3760us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment