Created
September 27, 2020 03:31
-
-
Save dcslin/02ccf905f5082adb1009cddeec90af7d to your computer and use it in GitHub Desktop.
Benchmark of PyTorch MNIST mixed-precision training with Apex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script is modified from https://github.com/pytorch/examples.git | |
""" | |
from __future__ import print_function | |
import argparse | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
import torch.optim as optim | |
from torchvision import datasets, transforms | |
from torch.optim.lr_scheduler import StepLR | |
from apex import amp | |
import time | |
class Net(nn.Module):
    """Small convolutional classifier used by the MNIST benchmark.

    The network applies two 3x3 convolutions, a 2x2 max-pool and two fully
    connected layers. ``forward`` returns per-class log-probabilities
    (``log_softmax`` over dimension 1), which is what ``F.nll_loss`` expects.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout: applied while activations are still 4-D
        # feature maps (right after the max-pool).
        self.dropout1 = nn.Dropout2d(0.25)
        # Element-wise dropout: this layer runs on the 2-D (batch, features)
        # output of fc1. Dropout2d is deprecated for 2-D input, so plain
        # Dropout is the correct module here.
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12, i.e. the flattened size produced by
        # the layers above for a 28x28 single-channel input.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, 10)`` for input ``x``."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, collapse everything else for the FC layers.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch with Apex AMP loss scaling and print its wall time.

    Args:
        args: Parsed command-line namespace; ``log_interval`` and ``dry_run``
            are read here.
        model: Network to optimise; switched to training mode by this call.
        device: Device every batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches that also exposes
            ``dataset`` (used only for progress output).
        optimizer: Optimizer handed to ``amp.scale_loss`` and stepped once per
            batch.
        epoch: Epoch number, used only in the progress output.
    """
    # CUDA kernels are launched asynchronously. Synchronize before reading the
    # clock so the measured interval covers the work actually executed for
    # this epoch rather than just the time spent queueing it.
    on_cuda = torch.device(device).type == 'cuda'
    if on_cuda:
        torch.cuda.synchronize()
    start_time = time.perf_counter()

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        # Backpropagate through the AMP-scaled loss instead of the raw loss.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                # Dry run: stop after the first logged batch.
                break

    if on_cuda:
        torch.cuda.synchronize()
    end_time = time.perf_counter()
    print("training used time %.5f sec" % (end_time - start_time))
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print mean loss and accuracy.

    Args:
        model: Network to evaluate; switched to evaluation mode by this call.
        device: Device every batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches that also exposes
            ``dataset``, whose length is used as the sample count.
    """
    model.eval()
    num_samples = len(test_loader.dataset)
    total_loss = 0
    num_correct = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Sum (not average) per batch so the final division by the
            # dataset size yields the per-sample mean.
            batch_loss = F.nll_loss(log_probs, labels, reduction='sum')
            total_loss += batch_loss.item()
            # Predicted class = index of the largest log-probability.
            predicted = log_probs.argmax(dim=1, keepdim=True)
            matches = predicted.eq(labels.view_as(predicted))
            num_correct += matches.sum().item()

    mean_loss = total_loss / num_samples
    accuracy_pct = 100. * num_correct / num_samples
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        mean_loss, num_correct, num_samples, accuracy_pct))
def main():
    """Parse command-line options, then train and evaluate the MNIST CNN with Apex AMP.

    For each epoch the model is trained, evaluated and the learning-rate
    scheduler is stepped. With ``--save-model`` the final weights are written
    to ``mnist_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    # Without a default, opt_level=None would reach amp.initialize, which only
    # accepts the four level names below. 'O1' is Apex's own default.
    parser.add_argument('--opt-level', type=str, default='O1',
                        choices=['O0', 'O1', 'O2', 'O3'],
                        help='Apex AMP optimization level (default: O1)')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Separate loader settings so that --test-batch-size is actually honoured
    # by the evaluation loader.
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        # NOTE(review): as in the original script, shuffling is only enabled
        # on the CUDA path and is applied to both loaders.
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    # enabled=False turns every Amp call into a no-op, so the script still
    # runs (in plain fp32) when CUDA is unavailable or disabled.
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=args.opt_level,
                                      enabled=use_cuda)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
# Script entry point: run the benchmark only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
O0 - fp32
Time(%) Time Calls Avg Min Max Name
9.92% 182.58ms 3752 48.662us 8.6080us 97.381us void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
8.18% 150.54ms 3127 48.141us 18.305us 63.971us void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
7.64% 140.53ms 6099 23.041us 928ns 53.347us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
7.27% 133.87ms 3127 42.812us 4.6080us 102.76us void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
6.41% 117.98ms 2189 53.897us 19.617us 77.732us volta_sgemm_128x64_nn
4.89% 89.924ms 938 95.867us 50.306us 97.829us volta_sgemm_128x64_nt
4.66% 85.714ms 1876 45.689us 12.193us 78.468us void flip_filter<float, float>(float*, float const *, int, int, int, int)
4.16% 76.545ms 938 81.604us 50.275us 85.412us volta_cgemm_32x32_tn
3.96% 72.832ms 938 77.645us 40.130us 78.980us void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>*)
3.89% 71.509ms 15000 4.7670us 1.0880us 27.841us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
3.49% 64.184ms 15008 4.2760us 928ns 27.490us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlfffE_NS_6detail5ArrayIPcLi4EEEEEviT0_T1_
3.29% 60.532ms 2190 27.640us 7.4560us 37.954us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESD_ILi1EjENS0_6memory15LoadWithoutCastENSG_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
2.67% 49.128ms 15008 3.2730us 928ns 18.977us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
2.67% 49.107ms 15008 3.2720us 928ns 20.097us _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
2.44% 44.844ms 1876 23.904us 10.176us 28.162us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
2.17% 40.017ms 15008 2.6660us 992ns 19.361us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_16sqrt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvENKUlvE_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
2.06% 37.948ms 1876 20.228us 8.6410us 32.129us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
1.99% 36.579ms 9380 3.8990us 896ns 27.778us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
1.94% 35.756ms 1095 32.653us 4.6080us 33.570us void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>*, long*)
1.91% 35.077ms 1094 32.063us 21.729us 35.426us volta_sgemm_64x32_sliced1x4_tn
1.90% 34.882ms 2204 15.826us 992ns 2.8076ms [CUDA memcpy HtoD]
1.88% 34.581ms 7504 4.6080us 1.0560us 27.009us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
1.81% 33.295ms 9388 3.5460us 768ns 18.049us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
1.32% 24.357ms 937 25.994us 25.409us 27.201us volta_sgemm_64x64_nn
1.15% 21.216ms 938 22.618us 14.913us 23.233us volta_sgemm_128x32_nt
1.08% 19.931ms 1876 10.624us 6.7840us 12.321us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESB_ILi1EjENS0_6memory15LoadWithoutCastENSE_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
1.03% 18.878ms 938 20.126us 9.4090us 24.930us void gemv2T_kernel_val<int, int, float2, float2, float2, int=128, int=16, int=2, int=2, bool=0, cublasGemvParams<cublasGemvTensorBatched<float2 const >, cublasGemvTensorBatched<float2>, float2>>(float2 const , float2, float2)
0.54% 9.8585ms 3127 3.1520us 2.4640us 11.104us void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
0.49% 9.0603ms 2190 4.1370us 3.3920us 9.2800us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
0.46% 8.4426ms 1096 7.7030us 6.9440us 22.625us volta_sgemm_32x32_sliced1x4_tn
0.36% 6.6202ms 938 7.0570us 4.8970us 8.8000us _ZN2at6native13reduce_kernelILi128ELi4ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
0.34% 6.3149ms 938 6.7320us 6.4320us 7.1370us volta_sgemm_32x32_sliced1x4_nt
0.30% 5.5124ms 938 5.8760us 4.6080us 8.0320us _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
0.25% 4.6503ms 938 4.9570us 4.8330us 7.6160us volta_sgemm_32x128_nn
0.24% 4.4169ms 1876 2.3540us 2.1120us 11.073us _ZN2at6native92_GLOBAL__N__68_tmpxft_00000d67_00000000_11_DistributionBernoulli_compute_75_cpp1_ii_cb3dce8443distribution_elementwise_grid_stride_kernelIfLi4EZNS0_9templates4cuda21uniform_and_transformIffLm4EPNS_17CUDAGeneratorImplEZZZNS4_16bernoulli_kernelIS7_EEvRNS_14TensorIteratorEdT_ENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvSA_T2_T3_EUlP24curandStatePhilox4_32_10E0_ZNS1_27distribution_nullary_kernelIffLi4ES7_SJ_SE_EEvSA_SF_RKSG_T4_EUlifE_EEviSt4pairImmET1_SF_
0.19% 3.5246ms 1876 1.8780us 1.4400us 8.7370us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.19% 3.4728ms 1095 3.1710us 2.5600us 8.6410us void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
0.17% 3.0794ms 1095 2.8120us 2.4320us 6.0480us void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
0.15% 2.7375ms 1876 1.4590us 1.1520us 7.3930us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
0.12% 2.2652ms 938 2.4140us 2.1120us 5.3130us void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
0.11% 2.0554ms 1095 1.8770us 1.6960us 6.1760us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
0.09% 1.5966ms 938 1.7020us 1.4080us 7.4250us void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
0.08% 1.4663ms 946 1.5500us 1.0560us 6.2720us [CUDA memset]
0.04% 824.06us 157 5.2480us 5.0560us 6.3680us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::ArgMaxOps<float>, unsigned int, long, int=4>>(float)
0.04% 699.49us 157 4.4550us 4.1600us 4.5770us _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
0.04% 673.12us 408 1.6490us 1.0880us 6.7840us [CUDA memcpy DtoH]
0.02% 371.25us 157 2.3640us 1.9840us 3.1680us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
0.01% 229.04us 157 1.4580us 1.2480us 2.4960us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
0.00% 15.936us 1 15.936us 15.936us 15.936us volta_sgemm_128x32_nn
0.00% 9.6000us 1 9.6000us 9.6000us 9.6000us volta_scudnn_128x32_relu_interior_nn_v1
0.00% 1.3760us 1 1.3760us 1.3760us 1.3760us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
O3 - fp16