Created
September 27, 2020 03:31
-
-
Save dcslin/02ccf905f5082adb1009cddeec90af7d to your computer and use it in GitHub Desktop.
Benchmark: PyTorch MNIST mixed-precision training with Apex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
This script is modified from the MNIST example in https://github.com/pytorch/examples.git
"""
from __future__ import print_function | |
import argparse | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
import torch.optim as optim | |
from torchvision import datasets, transforms | |
from torch.optim.lr_scheduler import StepLR | |
from apex import amp | |
import time | |
class Net(nn.Module):
    """Small convolutional classifier used for the MNIST benchmark.

    Two 3x3 convolutions, one 2x2 max-pool and two fully connected layers.
    ``forward`` returns log-probabilities over 10 classes, which is the
    input format expected by ``F.nll_loss``.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout: applied to the 4-D pooled feature maps.
        self.dropout1 = nn.Dropout2d(0.25)
        # Element-wise dropout: this layer sees the 2-D (batch, 128) output
        # of fc1. Dropout2d is defined for 3-D/4-D inputs only, so plain
        # Dropout is the correct module here.
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12, the flattened size for a 28x28 input
        # (28 -> 26 -> 24 after the two convolutions, 12 after pooling).
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: Input batch with a single channel; the hard-coded ``fc1``
                input size of 9216 implies 28x28 images.

        Returns:
            Tensor of shape ``(batch, 10)`` holding ``log_softmax`` outputs.
        """
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the FC layers.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch with Apex AMP loss scaling and print its duration.

    Args:
        args: Parsed options; ``log_interval`` and ``dry_run`` are read.
        model: Network to optimise; it is switched to training mode.
        device: Device (``torch.device`` or device string) that each batch
            is moved to.
        train_loader: Iterable of ``(data, target)`` batches that also
            exposes ``dataset`` (its length is used in the progress line).
        optimizer: Optimizer passed to ``amp.scale_loss`` and stepped once
            per batch.
        epoch: Epoch number, used only in the progress line.
    """
    # CUDA kernels are launched asynchronously, so the timer must only be
    # read once all queued GPU work has finished; otherwise the reported
    # epoch time can exclude work that is still in flight.
    on_cuda = torch.device(device).type == 'cuda'
    if on_cuda:
        torch.cuda.synchronize()
    # perf_counter is monotonic and intended for measuring intervals.
    start_time = time.perf_counter()
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        # amp.scale_loss takes the place of a plain loss.backward(): the
        # backward pass runs on the scaled loss it yields.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
    if on_cuda:
        torch.cuda.synchronize()
    end_time = time.perf_counter()
    print("training used time %.5f sec" % (end_time - start_time))
def test(model, device, test_loader): | |
model.eval() | |
test_loss = 0 | |
correct = 0 | |
with torch.no_grad(): | |
for data, target in test_loader: | |
data, target = data.to(device), target.to(device) | |
output = model(data) | |
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss | |
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability | |
correct += pred.eq(target.view_as(pred)).sum().item() | |
test_loss /= len(test_loader.dataset) | |
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |
test_loss, correct, len(test_loader.dataset), | |
100. * correct / len(test_loader.dataset))) | |
def main():
    """Parse command-line options, then train and evaluate the MNIST CNN with Apex AMP.

    MNIST is read from (and, for the training split, downloaded to)
    ``../data``. The model and optimizer are wrapped by ``amp.initialize``
    at the requested ``--opt-level``; each epoch runs ``train`` then
    ``test`` and steps the learning-rate scheduler. With ``--save-model``
    the final weights are written to ``mnist_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    # Without a default, omitting the flag passed opt_level=None to
    # amp.initialize, which only accepts the four level names below.
    parser.add_argument('--opt-level', type=str, default='O1',
                        choices=['O0', 'O1', 'O2', 'O3'],
                        help='Apex AMP optimization level (default: O1)')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Separate loader settings so that --test-batch-size is honoured and
    # only the training data is shuffled, on every device.
    train_kwargs = {'batch_size': args.batch_size, 'shuffle': True}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1, 'pin_memory': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    # AMP wraps both objects; the returned pair must be used from here on.
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
O3 (fp16 only)
Time(%) Time Calls Avg Min Max Name
47.98% 884.67ms 1876 471.57us 205.00us 658.40us turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
7.48% 137.90ms 938 147.02us 98.981us 149.51us volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
3.86% 71.183ms 6880 10.346us 1.8570us 22.049us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
3.82% 70.370ms 938 75.021us 37.506us 76.451us void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<c10::Half, float>(int, c10::Half const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<c10::Half, float>*)
3.32% 61.296ms 6099 10.050us 896ns 27.233us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIN3c104HalfEEEvRNS_14TensorIteratorET_S7_EUlS4_S4_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
2.88% 53.054ms 1095 48.451us 18.081us 52.514us turing_fp16_s1688cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
2.33% 42.928ms 2190 19.601us 7.2320us 24.673us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESE_ILi1EjENS0_6memory15LoadWithoutCastENSH_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
2.12% 39.164ms 15000 2.6100us 1.0880us 14.369us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS4_4HalfES8_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
2.07% 38.174ms 2204 17.320us 961ns 5.4919ms [CUDA memcpy HtoD]
1.92% 35.482ms 15008 2.3640us 960ns 10.080us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfES8_E_EEvS4_RKT_EUlS8_E0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
1.74% 32.068ms 1876 17.093us 9.8880us 19.649us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
1.72% 31.695ms 15008 2.1110us 992ns 14.144us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE6_clEvENKUlvE_clEvEUlNS4_4HalfES9_S9_E_NS_6detail5ArrayIPcLi4EEEEEviT0_T1_
1.66% 30.622ms 15008 2.0400us 992ns 9.8890us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE8_clEvEUlNS5_4HalfES9_E_EEvS4_RKT_EUlS9_E0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
1.57% 28.964ms 15008 1.9290us 992ns 9.7600us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_16sqrt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE3_clEvENKUlvE_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
1.31% 24.142ms 1095 22.047us 4.5120us 22.850us void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<c10::Half, c10::Half>(int, c10::Half const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<c10::Half, c10::Half>*, long*)
1.09% 20.160ms 8450 2.3850us 768ns 10.432us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE9_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
1.08% 19.827ms 1095 18.107us 4.3210us 19.585us void nhwcToNchwKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
0.97% 17.811ms 1093 16.295us 15.744us 18.753us turing_s1688gemm_fp16_128x128_ldg8_tn
0.95% 17.480ms 7504 2.3290us 1.1200us 14.497us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfES7_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.92% 16.907ms 1876 9.0120us 6.9760us 10.017us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfES7_E_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESD_ILi1EjENS0_6memory15LoadWithoutCastENSG_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.90% 16.632ms 9380 1.7730us 896ns 14.017us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE8_clEvEUlN3c104HalfES7_E_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.80% 14.739ms 937 15.730us 14.784us 19.233us turing_fp16_s1688gemm_fp16_128x128_ldg8_f2f_nn
0.66% 12.195ms 1094 11.147us 10.305us 11.616us volta_fp16_scudnn_fp16_128x32_relu_interior_nn_v1
0.56% 10.338ms 938 11.021us 8.2250us 11.936us turing_fp16_s1688gemm_fp16_128x128_ldg8_f2f_nt
0.54% 9.9298ms 2041 4.8650us 1.2800us 16.353us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESE_NS0_6memory12LoadWithCastILi1EEENSF_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.50% 9.1646ms 1095 8.3690us 8.0650us 9.0240us volta_fp16_sgemm_fp16_32x32_sliced1x4_tn
0.48% 8.9079ms 2190 4.0670us 2.7200us 7.7760us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE8_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESE_NS0_6memory15LoadWithoutCastENSF_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.45% 8.2085ms 938 8.7510us 5.6000us 13.920us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.40% 7.3664ms 3760 1.9590us 1.0240us 17.729us [CUDA memset]
0.36% 6.6373ms 938 7.0760us 6.9120us 7.7440us volta_fp16_sgemm_fp16_32x32_sliced1x4_nt
0.35% 6.4816ms 938 6.9100us 5.0890us 7.4570us _ZN2at6native13reduce_kernelILi128ELi4ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
0.32% 5.9368ms 938 6.3290us 5.0240us 8.4800us _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIN3c104HalfENS0_14func_wrapper_tIS4_ZNS0_15sum_kernel_implIS4_fS4_EEvRNS_14TensorIteratorEEUlffE_EEjS4_Li4EEEEEvT1_
0.28% 5.1727ms 3127 1.6540us 1.2800us 8.5130us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.26% 4.7299ms 1095 4.3190us 4.0320us 8.0330us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.26% 4.7163ms 937 5.0330us 4.8000us 8.5770us volta_fp16_sgemm_fp16_128x32_nn
0.24% 4.3862ms 1876 2.3380us 2.0800us 7.0400us _ZN2at6native92_GLOBAL__N__68_tmpxft_00000d67_00000000_11_DistributionBernoulli_compute_75_cpp1_ii_cb3dce8443distribution_elementwise_grid_stride_kernelIfLi4EZNS0_9templates4cuda21uniform_and_transformIN3c104HalfEfLm4EPNS_17CUDAGeneratorImplEZZZNS4_16bernoulli_kernelIS9_EEvRNS_14TensorIteratorEdT_ENKUlvE_clEvENKUlvE6_clEvEUlfE_EEvSC_T2_T3_EUlP24curandStatePhilox4_32_10E0_ZNS1_27distribution_nullary_kernelIS7_fLi4ES9_SL_SG_EEvSC_SH_RKSI_T4_EUlifE_EEviSt4pairImmET1_SH_
0.20% 3.6846ms 1095 3.3640us 2.9760us 6.1760us void splitKreduce_kernel<float, __half, float>(cublasSplitKParams<float>, float const *, __half const *, __half*, float const *, float const *)
0.20% 3.6346ms 938 3.8740us 3.4890us 10.305us void nhwcToNchwKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, float const *, __half*, float, float)
0.19% 3.5205ms 1876 1.8760us 1.3440us 10.112us cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.19% 3.4473ms 1095 3.1480us 2.5280us 6.2720us void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
0.17% 3.1329ms 938 3.3390us 3.2320us 7.3600us void nhwcToNchwKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, float const *, __half*, float, float)
0.15% 2.8574ms 1876 1.5230us 1.1520us 8.1290us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE3_clEvEUlN3c104HalfEE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.14% 2.6527ms 1876 1.4140us 1.0240us 8.3200us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.12% 2.2396ms 938 2.3870us 2.1760us 10.209us void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
0.11% 2.0994ms 1095 1.9170us 1.7280us 5.0570us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, int, int, int)
0.08% 1.5511ms 938 1.6530us 1.4080us 6.4330us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<c10::Half, c10::Half, float, int=4, bool=1>(c10::Half*, c10::Half const *, c10::Half const , int, int, int)
0.07% 1.2831ms 938 1.3670us 992ns 9.7610us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
0.07% 1.2609ms 938 1.3440us 1.0560us 5.3450us cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.05% 831.46us 157 5.2950us 5.0880us 8.6400us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::ArgMaxOps<float>, unsigned int, long, int=4>>(float)
0.04% 737.77us 408 1.8080us 1.0880us 6.2080us [CUDA memcpy DtoH]
0.04% 691.00us 157 4.4010us 4.3200us 4.6400us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
0.02% 374.52us 157 2.3850us 1.9520us 4.6720us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.01% 238.79us 157 1.5200us 1.2480us 4.4160us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.00% 32.706us 2 16.353us 15.681us 17.025us turing_s884gemm_fp16_128x64_ldg8_tn
0.00% 10.369us 1 10.369us 10.369us 10.369us turing_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn
0.00% 8.8010us 1 8.8010us 8.8010us 8.8010us void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.00% 5.1520us 1 5.1520us 5.1520us 5.1520us volta_fp16_sgemm_fp16_32x128_nn
O0 (fp32 only)
Time(%) Time Calls Avg Min Max Name
9.92% 182.58ms 3752 48.662us 8.6080us 97.381us void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
8.18% 150.54ms 3127 48.141us 18.305us 63.971us void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
7.64% 140.53ms 6099 23.041us 928ns 53.347us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
7.27% 133.87ms 3127 42.812us 4.6080us 102.76us void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
6.41% 117.98ms 2189 53.897us 19.617us 77.732us volta_sgemm_128x64_nn
4.89% 89.924ms 938 95.867us 50.306us 97.829us volta_sgemm_128x64_nt
4.66% 85.714ms 1876 45.689us 12.193us 78.468us void flip_filter<float, float>(float*, float const *, int, int, int, int)
4.16% 76.545ms 938 81.604us 50.275us 85.412us volta_cgemm_32x32_tn
3.96% 72.832ms 938 77.645us 40.130us 78.980us void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>*)
3.89% 71.509ms 15000 4.7670us 1.0880us 27.841us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
3.49% 64.184ms 15008 4.2760us 928ns 27.490us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlfffE_NS_6detail5ArrayIPcLi4EEEEEviT0_T1_
3.29% 60.532ms 2190 27.640us 7.4560us 37.954us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESD_ILi1EjENS0_6memory15LoadWithoutCastENSG_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
2.67% 49.128ms 15008 3.2730us 928ns 18.977us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
2.67% 49.107ms 15008 3.2720us 928ns 20.097us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
2.44% 44.844ms 1876 23.904us 10.176us 28.162us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
2.17% 40.017ms 15008 2.6660us 992ns 19.361us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_16sqrt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvENKUlvE_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
2.06% 37.948ms 1876 20.228us 8.6410us 32.129us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
1.99% 36.579ms 9380 3.8990us 896ns 27.778us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
1.94% 35.756ms 1095 32.653us 4.6080us 33.570us void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>*, long*)
1.91% 35.077ms 1094 32.063us 21.729us 35.426us volta_sgemm_64x32_sliced1x4_tn
1.90% 34.882ms 2204 15.826us 992ns 2.8076ms [CUDA memcpy HtoD]
1.88% 34.581ms 7504 4.6080us 1.0560us 27.009us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
1.81% 33.295ms 9388 3.5460us 768ns 18.049us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
1.32% 24.357ms 937 25.994us 25.409us 27.201us volta_sgemm_64x64_nn
1.15% 21.216ms 938 22.618us 14.913us 23.233us volta_sgemm_128x32_nt
1.08% 19.931ms 1876 10.624us 6.7840us 12.321us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESB_ILi1EjENS0_6memory15LoadWithoutCastENSE_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
1.03% 18.878ms 938 20.126us 9.4090us 24.930us void gemv2T_kernel_val<int, int, float2, float2, float2, int=128, int=16, int=2, int=2, bool=0, cublasGemvParams<cublasGemvTensorBatched<float2 const >, cublasGemvTensorBatched<float2>, float2>>(float2 const , float2, float2)
0.54% 9.8585ms 3127 3.1520us 2.4640us 11.104us void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
0.49% 9.0603ms 2190 4.1370us 3.3920us 9.2800us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.46% 8.4426ms 1096 7.7030us 6.9440us 22.625us volta_sgemm_32x32_sliced1x4_tn
0.36% 6.6202ms 938 7.0570us 4.8970us 8.8000us _ZN2at6native13reduce_kernelILi128ELi4ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
0.34% 6.3149ms 938 6.7320us 6.4320us 7.1370us volta_sgemm_32x32_sliced1x4_nt
0.30% 5.5124ms 938 5.8760us 4.6080us 8.0320us _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
0.25% 4.6503ms 938 4.9570us 4.8330us 7.6160us volta_sgemm_32x128_nn
0.24% 4.4169ms 1876 2.3540us 2.1120us 11.073us _ZN2at6native92_GLOBAL__N__68_tmpxft_00000d67_00000000_11_DistributionBernoulli_compute_75_cpp1_ii_cb3dce8443distribution_elementwise_grid_stride_kernelIfLi4EZNS0_9templates4cuda21uniform_and_transformIffLm4EPNS_17CUDAGeneratorImplEZZZNS4_16bernoulli_kernelIS7_EEvRNS_14TensorIteratorEdT_ENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvSA_T2_T3_EUlP24curandStatePhilox4_32_10E0_ZNS1_27distribution_nullary_kernelIffLi4ES7_SJ_SE_EEvSA_SF_RKSG_T4_EUlifE_EEviSt4pairImmET1_SF_
0.19% 3.5246ms 1876 1.8780us 1.4400us 8.7370us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.19% 3.4728ms 1095 3.1710us 2.5600us 8.6410us void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
0.17% 3.0794ms 1095 2.8120us 2.4320us 6.0480us void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
0.15% 2.7375ms 1876 1.4590us 1.1520us 7.3930us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.12% 2.2652ms 938 2.4140us 2.1120us 5.3130us void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
0.11% 2.0554ms 1095 1.8770us 1.6960us 6.1760us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
0.09% 1.5966ms 938 1.7020us 1.4080us 7.4250us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
0.08% 1.4663ms 946 1.5500us 1.0560us 6.2720us [CUDA memset]
0.04% 824.06us 157 5.2480us 5.0560us 6.3680us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::ArgMaxOps<float>, unsigned int, long, int=4>>(float)
0.04% 699.49us 157 4.4550us 4.1600us 4.5770us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
0.04% 673.12us 408 1.6490us 1.0880us 6.7840us [CUDA memcpy DtoH]
0.02% 371.25us 157 2.3640us 1.9840us 3.1680us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.01% 229.04us 157 1.4580us 1.2480us 2.4960us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.00% 15.936us 1 15.936us 15.936us 15.936us volta_sgemm_128x32_nn
0.00% 9.6000us 1 9.6000us 9.6000us 9.6000us volta_scudnn_128x32_relu_interior_nn_v1
0.00% 1.3760us 1 1.3760us 1.3760us 1.3760us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Command:
Hardware:
1 × RTX 2080 Ti
CUDA 10.2
cuDNN 7.6
Result:
O0(fp32 only): 14.21905 sec
O1(mixed, selected compute in fp16): 19.12530 sec
O2(almost fp16): 15.02236 sec
O3(fp16 only): 14.62414 sec