@dcslin
Created September 27, 2020 03:31
Benchmark PyTorch MNIST mixed-precision training with Apex
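To benchmark a given precision mode, pass the Apex optimization level when running the script (the file name below is only a placeholder for wherever you saved it):

    python mnist_apex.py --opt-level O0   # fp32 baseline
    python mnist_apex.py --opt-level O2   # mixed precision, fp16 model with fp32 master weights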
"""
This script is modified from https://github.com/pytorch/examples.git
"""
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from apex import amp
import time
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        # 9216 = 64 channels * 12 * 12: the two 3x3 convs take 28x28 -> 26x26 -> 24x24,
        # and the 2x2 max-pool halves that to 12x12.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
def train(args, model, device, train_loader, optimizer, epoch):
    start_time = time.time()
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        # Instead of the usual loss.backward(), Apex scales the loss before the
        # backward pass so small fp16 gradients do not underflow, then unscales
        # the gradients again before optimizer.step().
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
    end_time = time.time()
    print("training used time %.5f sec" % (end_time - start_time))
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--opt-level', type=str,
                        help='Apex AMP optimization level (O0, O1, O2, or O3)')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    # amp.initialize patches the model and optimizer for the requested opt level:
    # O0 is the fp32 baseline, O1 inserts casts around whitelisted ops,
    # O2 runs the model in fp16 while keeping fp32 master weights, O3 is pure fp16.
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()
    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()
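For comparison: PyTorch 1.6 added a native equivalent of this pattern, torch.cuda.amp, which has since replaced Apex amp as the recommended mixed-precision API. A minimal sketch of the training step under the native API, assuming the same model, optimizer, device, and data loader as in the script above:

    import torch
    import torch.nn.functional as F

    scaler = torch.cuda.amp.GradScaler()  # plays the role of amp.scale_loss

    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # run the forward pass in mixed precision where it is numerically safe
        with torch.cuda.amp.autocast():
            output = model(data)
            loss = F.nll_loss(output, target)
        # scale the loss so small fp16 gradients do not underflow,
        # then unscale before stepping the optimizer
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()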
dcslin commented Sep 30, 2020

O0 (fp32 baseline): per-kernel GPU time summary

Time(%)      Time     Calls       Avg       Min       Max  Name
  9.92%  182.58ms      3752  48.662us  8.6080us  97.381us  void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
  8.18%  150.54ms      3127  48.141us  18.305us  63.971us  void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
  7.64%  140.53ms      6099  23.041us     928ns  53.347us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S5_EUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  7.27%  133.87ms      3127  42.812us  4.6080us  102.76us  void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
  6.41%  117.98ms      2189  53.897us  19.617us  77.732us  volta_sgemm_128x64_nn
  4.89%  89.924ms       938  95.867us  50.306us  97.829us  volta_sgemm_128x64_nt
  4.66%  85.714ms      1876  45.689us  12.193us  78.468us  void flip_filter<float, float>(float*, float const *, int, int, int, int)
  4.16%  76.545ms       938  81.604us  50.275us  85.412us  volta_cgemm_32x32_tn
  3.96%  72.832ms       938  77.645us  40.130us  78.980us  void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_backward_nchw<float, float>*)
  3.89%  71.509ms     15000  4.7670us  1.0880us  27.841us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  3.49%  64.184ms     15008  4.2760us     928ns  27.490us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvENKUlvE_clEvEUlfffE_NS_6detail5ArrayIPcLi4EEEEEviT0_T1_
  3.29%  60.532ms      2190  27.640us  7.4560us  37.954us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESD_ILi1EjENS0_6memory15LoadWithoutCastENSG_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  2.67%  49.128ms     15008  3.2730us     928ns  18.977us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  2.67%  49.107ms     15008  3.2720us     928ns  20.097us  _ZN2at6native29vectorized_elementwise_kernelILi4EZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUlfE0_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  2.44%  44.844ms      1876  23.904us  10.176us  28.162us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  2.17%  40.017ms     15008  2.6660us     992ns  19.361us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_16sqrt_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvENKUlvE_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  2.06%  37.948ms      1876  20.228us  8.6410us  32.129us  void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
  1.99%  36.579ms      9380  3.8990us     896ns  27.778us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  1.94%  35.756ms      1095  32.653us  4.6080us  33.570us  void at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00000d13_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::max_pool_forward_nchw<float, float>*, long*)
  1.91%  35.077ms      1094  32.063us  21.729us  35.426us  volta_sgemm_64x32_sliced1x4_tn
  1.90%  34.882ms      2204  15.826us     992ns  2.8076ms  [CUDA memcpy HtoD]
  1.88%  34.581ms      7504  4.6080us  1.0560us  27.009us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE0_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  1.81%  33.295ms      9388  3.5460us     768ns  18.049us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_NS_6detail5ArrayIPcLi1EEEEEviT0_T1_
  1.32%  24.357ms       937  25.994us  25.409us  27.201us  volta_sgemm_64x64_nn
  1.15%  21.216ms       938  22.618us  14.913us  23.233us  volta_sgemm_128x32_nt
  1.08%  19.931ms      1876  10.624us  6.7840us  12.321us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEE16OffsetCalculatorILi2EjESB_ILi1EjENS0_6memory15LoadWithoutCastENSE_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  1.03%  18.878ms       938  20.126us  9.4090us  24.930us  void gemv2T_kernel_val<int, int, float2, float2, float2, int=128, int=16, int=2, int=2, bool=0, cublasGemvParams<cublasGemvTensorBatched<float2 const >, cublasGemvTensorBatched<float2>, float2>>(float2 const , float2, float2)
  0.54%  9.8585ms      3127  3.1520us  2.4640us  11.104us  void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
  0.49%  9.0603ms      2190  4.1370us  3.3920us  9.2800us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE16OffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
  0.46%  8.4426ms      1096  7.7030us  6.9440us  22.625us  volta_sgemm_32x32_sliced1x4_tn
  0.36%  6.6202ms       938  7.0570us  4.8970us  8.8000us  _ZN2at6native13reduce_kernelILi128ELi4ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  0.34%  6.3149ms       938  6.7320us  6.4320us  7.1370us  volta_sgemm_32x32_sliced1x4_nt
  0.30%  5.5124ms       938  5.8760us  4.6080us  8.0320us  _ZN2at6native13reduce_kernelILi256ELi2ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT1_
  0.25%  4.6503ms       938  4.9570us  4.8330us  7.6160us  volta_sgemm_32x128_nn
  0.24%  4.4169ms      1876  2.3540us  2.1120us  11.073us  _ZN2at6native92_GLOBAL__N__68_tmpxft_00000d67_00000000_11_DistributionBernoulli_compute_75_cpp1_ii_cb3dce8443distribution_elementwise_grid_stride_kernelIfLi4EZNS0_9templates4cuda21uniform_and_transformIffLm4EPNS_17CUDAGeneratorImplEZZZNS4_16bernoulli_kernelIS7_EEvRNS_14TensorIteratorEdT_ENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvSA_T2_T3_EUlP24curandStatePhilox4_32_10E0_ZNS1_27distribution_nullary_kernelIffLi4ES7_SJ_SE_EEvSA_SF_RKSG_T4_EUlifE_EEviSt4pairImmET1_SF_
  0.19%  3.5246ms      1876  1.8780us  1.4400us  8.7370us  compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
  0.19%  3.4728ms      1095  3.1710us  2.5600us  8.6410us  void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)
  0.17%  3.0794ms      1095  2.8120us  2.4320us  6.0480us  void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const *, float const *, float*, float const *, float const *)
  0.15%  2.7375ms      1876  1.4590us  1.1520us  7.3930us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
  0.12%  2.2652ms       938  2.4140us  2.1120us  5.3130us  void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)
  0.11%  2.0554ms      1095  1.8770us  1.6960us  6.1760us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_forward<float, float, float, int=4, bool=1>(float*, float const *, int, int, int)
  0.09%  1.5966ms       938  1.7020us  1.4080us  7.4250us  void _GLOBAL__N__54_tmpxft_000019f7_00000000_11_SoftMax_compute_75_cpp1_ii_a3310042::softmax_warp_backward<float, float, float, int=4, bool=1>(float*, float const *, float const , int, int, int)
  0.08%  1.4663ms       946  1.5500us  1.0560us  6.2720us  [CUDA memset]
  0.04%  824.06us       157  5.2480us  5.0560us  6.3680us  void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<float, at::native::ArgMaxOps<float>, unsigned int, long, int=4>>(float)
  0.04%  699.49us       157  4.4550us  4.1600us  4.5770us  _ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIlNS0_14func_wrapper_tIlZNS0_15sum_kernel_implIlllEEvRNS_14TensorIteratorEEUlllE_EEjlLi4EEEEEvT1_
  0.04%  673.12us       408  1.6490us  1.0880us  6.7840us  [CUDA memcpy DtoH]
  0.02%  371.25us       157  2.3640us  1.9840us  3.1680us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE0_clEvENKUlvE4_clEvEUllE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
  0.01%  229.04us       157  1.4580us  1.2480us  2.4960us  _ZN2at6native29vectorized_elementwise_kernelILi4EZZZZNS0_14eq_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE4_clEvENKUlvE_clEvEUlllE_NS_6detail5ArrayIPcLi3EEEEEviT0_T1_
  0.00%  15.936us         1  15.936us  15.936us  15.936us  volta_sgemm_128x32_nn
  0.00%  9.6000us         1  9.6000us  9.6000us  9.6000us  volta_scudnn_128x32_relu_interior_nn_v1
  0.00%  1.3760us         1  1.3760us  1.3760us  1.3760us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
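The kernel names are consistent with the fp32 (O0) baseline: the GEMMs are volta_sgemm_* variants and the convolutions run through the float Winograd and FFT paths. On the mixed-precision opt levels one would expect much of this time to shift to half-precision Tensor Core kernels instead (on Volta, typically names containing 884), which is the difference this benchmark is meant to surface.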
