@koshian2
Created June 10, 2019 07:28
PyTorch Mixed Precision/FP16
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
# https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/fp16util.py
class tofp16(nn.Module):
    """
    Utility module that implements::

        def forward(self, input):
            return input.half()
    """

    def __init__(self):
        super(tofp16, self).__init__()

    def forward(self, input):
        return input.half()
def BN_convert_float(module):
    """
    Utility function for network_to_half().

    Retained for legacy purposes.
    """
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
        module.float()
    for child in module.children():
        BN_convert_float(child)
    return module
def network_to_half(network):
    """
    Convert model to half precision in a batchnorm-safe way.

    Retained for legacy purposes. It is recommended to use FP16Model.
    """
    return nn.Sequential(tofp16(), BN_convert_float(network.half()))
def convert_module(module, dtype):
    """
    Converts a module's immediate parameters and buffers to dtype.
    """
    for param in module.parameters(recurse=False):
        if param is not None:
            if param.data.dtype.is_floating_point:
                param.data = param.data.to(dtype=dtype)
            if param._grad is not None and param._grad.data.dtype.is_floating_point:
                param._grad.data = param._grad.data.to(dtype=dtype)

    for buf in module.buffers(recurse=False):
        if buf is not None and buf.data.dtype.is_floating_point:
            buf.data = buf.data.to(dtype=dtype)
def convert_network(network, dtype):
    """
    Converts a network's parameters and buffers to dtype.
    """
    for module in network.modules():
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
            continue
        convert_module(module, dtype)
        if isinstance(module, torch.nn.RNNBase) or isinstance(module, torch.nn.modules.rnn.RNNBase):
            module.flatten_parameters()
    return network
class FP16Model(nn.Module):
    """
    Convert model to half precision in a batchnorm-safe way.
    """

    def __init__(self, network):
        super(FP16Model, self).__init__()
        self.network = convert_network(network, dtype=torch.half)

    def forward(self, *inputs):
        inputs = tuple(t.half() for t in inputs)
        return self.network(*inputs)
def backwards_debug_hook(grad):
    raise RuntimeError("master_params received a gradient in the backward pass!")
def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
    `Training Neural Networks with Mixed Precision: Real Examples`_.

    Args:
        model (torch.nn.Module): Existing Pytorch model
        flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization.

    Returns:
        A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master parameters. If ``flat_master=True``, ``master_params`` will be a list with one element.

    Example::

        model_params, master_params = prep_param_lists(model)

    .. warning::
        Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.

    .. _`Training Neural Networks with Mixed Precision: Real Examples`:
        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
    """
    model_params = [param for param in model.parameters() if param.requires_grad]

    if flat_master:
        # Give the user some more useful error messages
        try:
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
        except:
            print("Error in prep_param_lists: model may contain a mixture of parameters "
                  "of different types. Use flat_master=False, or use FP16_Optimizer.")
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
        # master_params.register_hook(backwards_debug_hook)
        if master_params.grad is None:
            master_params.grad = master_params.new(*master_params.size())
        return model_params, [master_params]
    else:
        master_params = [param.clone().float().detach() for param in model_params]
        for param in master_params:
            param.requires_grad = True
        return model_params, master_params
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
    """
    Copy model gradients to master gradients.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
    """
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        master_params[0].grad.data.copy_(
            _flatten_dense_tensors([p.grad.data for p in model_params]))
    else:
        for model, master in zip(model_params, master_params):
            if model.grad is not None:
                if master.grad is None:
                    master.grad = Variable(master.data.new(*master.data.size()))
                master.grad.data.copy_(model.grad.data)
            else:
                master.grad = None
def master_params_to_model_params(model_params, master_params, flat_master=False):
    """
    Copy master parameters to model parameters.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
    """
    if flat_master:
        for model, master in zip(model_params,
                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)
# Backward compatibility fixes

def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]

TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

if TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
    clip_grad_norm = torch.nn.utils.clip_grad_norm
else:
    clip_grad_norm = torch.nn.utils.clip_grad_norm_
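
The utilities above are meant to be wired together into a manual mixed-precision step: run forward/backward in FP16, then copy the gradients to FP32 master weights, update those, and copy the result back. The following is a minimal sketch of that workflow with static loss scaling; MyModel, criterion, loader, the loss scale of 128 and the learning rate are illustrative assumptions, not part of the gist.

# Hypothetical sketch: one optimizer step with FP16 model weights and FP32 master weights.
model = network_to_half(MyModel().cuda())             # FP16 model (affine BatchNorm kept in FP32)
model_params, master_params = prep_param_lists(model)
optimizer = torch.optim.SGD(master_params, lr=0.01)   # the optimizer updates the FP32 copies
loss_scale = 128.0                                    # static loss scale (assumed value)

for inputs, labels in loader:
    outputs = model(inputs.cuda())
    loss = criterion(outputs, labels.cuda())
    model.zero_grad()
    (loss * loss_scale).backward()                    # scale so small FP16 grads do not underflow
    model_grads_to_master_grads(model_params, master_params)
    for p in master_params:
        p.grad.data.mul_(1.0 / loss_scale)            # unscale in FP32
    clip_grad_norm(master_params, 5.0)                # version-agnostic alias defined above
    optimizer.step()
    master_params_to_model_params(model_params, master_params)  # copy back into the FP16 weights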
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from pytorch_models import Layer10CNN, WideResNet
import numpy as np
import datetime
import time
import pickle
import os
from fp16util import network_to_half
def dataloaders(batch_size):
    # torchvision outputs are in [0, 1]
    trans = transforms.Compose([
        transforms.ToTensor()
    ])
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=trans)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=1)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=trans)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                             shuffle=False, num_workers=1)
    # Tuning num_workers (plain FP16 conversion with half())
    # WRN 512 1GPU
    #   workers=1: 80s / workers=4: 83s / workers=8: 88s
    # WRN 512 2GPU
    #   workers=1: 44s / workers=4: 47s / workers=8: 52s
    # -> workers=1 is the fastest
    # With FP16 conversion via fp16util.py:
    # WRN 512 1GPU
    #   workers=1: 76s / workers=4: 78s
    # WRN 512 2GPU
    #   workers=1: 40s / workers=4: 44s
    return trainloader, testloader
def train(batch_size, network, use_device):
    if network == 0:
        model = Layer10CNN()
    elif network == 1:
        model = WideResNet()
    device = "cuda"
    torch.backends.cudnn.benchmark = True
    model = model.cuda()

    train_loader, test_loader = dataloaders(batch_size)
    criterion = torch.nn.CrossEntropyLoss()
    initial_lr = 0.1 * batch_size / 128
    optimizer = optim.SGD(model.parameters(), lr=initial_lr, momentum=0.9)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 80], gamma=0.1)

    if use_device == "multigpu":
        model = torch.nn.DataParallel(model)
    model = network_to_half(model)

    result = {}
    result["train_begin"] = datetime.datetime.now()
    result["times"] = []
    result["val_acc"] = []
    result["loss"] = []

    for epoch in range(100):
        start_time = time.time()
        # train
        train_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            # inputs stay FP32 here; the tofp16() layer added by network_to_half casts them to FP16
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= i + 1  # per batch loss

        # Validation
        with torch.no_grad():
            correct, total = 0, 0
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, pred = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (pred == labels).sum().item()
            val_acc = correct / total

        # log
        elapsed = time.time() - start_time
        result["times"].append(elapsed)
        result["loss"].append(train_loss)
        result["val_acc"].append(val_acc)
        print(f"Epoch {epoch+1} loss = {train_loss:.06} val_acc = {val_acc:.04} | {elapsed:0.4}s")

    result["train_end"] = datetime.datetime.now()
    with open(f"result/{use_device}_{network}_{batch_size}.pkl", "wb") as fp:
        pickle.dump(result, fp)
def train_gpus():
    for network in [0, 1]:
        for device in ["gpu", "multigpu"]:
            for batch in [128, 256, 512, 1024, 2048]:
                if device == "gpu":
                    if network == 1 and batch > 512: continue
                if device == "multigpu":
                    if network == 1 and batch > 1024: continue
                train(batch, network, device)
                time.sleep(60)

if __name__ == "__main__":
    train_gpus()
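
The loop above feeds FP16 logits straight into CrossEntropyLoss. An optional tweak, sketched below and not part of the original script, casts the logits to FP32 for the loss only; this is a common precaution against FP16 range issues in the softmax and leaves the rest of the loop unchanged.

# Optional variant of the inner training step (assumption, not in the original script):
outputs = model(inputs)                     # FP16 logits from the half-precision model
loss = criterion(outputs.float(), labels)   # compute softmax/cross-entropy in FP32
loss.backward()                             # gradients are cast back to FP16 for the model weights
optimizer.step()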
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from pytorch_models import Layer10CNN, WideResNet
from apex import amp
import numpy as np
import datetime
import time
import pickle
def dataloaders(batch_size):
    # torchvision outputs are in [0, 1]
    trans = transforms.Compose([
        transforms.ToTensor()
    ])
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=trans)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=1)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=trans)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                             shuffle=False, num_workers=1)
    # Tuning num_workers
    # WRN 512 1GPU
    #   O1 = workers=1: 78s / workers=4: 84s / workers=8: 90s
    # WRN 512 2GPU
    #   O1 = workers=1: 46s / workers=4: 48s / workers=8: 55s
    # -> workers=1 is the fastest
    return trainloader, testloader
def train(batch_size, network, use_device):
    if network == 0:
        model = Layer10CNN()
    elif network == 1:
        model = WideResNet()
    device = "cuda"
    torch.backends.cudnn.benchmark = True
    model = model.cuda()

    train_loader, test_loader = dataloaders(batch_size)
    criterion = torch.nn.CrossEntropyLoss()
    initial_lr = 0.1 * batch_size / 128
    optimizer = optim.SGD(model.parameters(), lr=initial_lr, momentum=0.9)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 80], gamma=0.1)

    # opt_level: O1 = Mixed Precision (recommended), O2 = Almost FP16, O3 = FP16
    # O2 and O3 do not work with multiple GPUs (known issue)
    # WRN 1GPU 512: O1 = 80s / O3 = 77s
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if use_device == "multigpu":
        model = torch.nn.DataParallel(model)

    result = {}
    result["train_begin"] = datetime.datetime.now()
    result["times"] = []
    result["val_acc"] = []
    result["loss"] = []

    for epoch in range(100):
        start_time = time.time()
        # train
        train_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # loss.backward() is replaced by amp's scaled backward pass
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= i + 1  # per batch loss

        # Validation
        with torch.no_grad():
            correct, total = 0, 0
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, pred = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (pred == labels).sum().item()
            val_acc = correct / total

        # log
        elapsed = time.time() - start_time
        result["times"].append(elapsed)
        result["loss"].append(train_loss)
        result["val_acc"].append(val_acc)
        print(f"Epoch {epoch+1} loss = {train_loss:.06} val_acc = {val_acc:.04} | {elapsed:0.4}s")

    result["train_end"] = datetime.datetime.now()
    with open(f"result/{use_device}_{network}_{batch_size}.pkl", "wb") as fp:
        pickle.dump(result, fp)
def train_gpus():
    for network in [0, 1]:
        for device in ["gpu", "multigpu"]:
            for batch in [128, 256, 512, 1024, 2048]:
                if device == "gpu":
                    if network == 1 and batch > 512: continue
                if device == "multigpu":
                    if network == 1 and batch > 1024: continue
                train(batch, network, device)
                time.sleep(60)

if __name__ == "__main__":
    train_gpus()
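
If gradient clipping were added to the amp version, it should be applied to the FP32 master parameters that amp manages rather than to model.parameters(). A minimal sketch of how the backward step above would change, assuming a max_norm of 5.0 (the value is illustrative):

# Hypothetical addition to the amp training step above:
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=5.0)
optimizer.step()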