Created
June 10, 2019 07:28
-
-
Save koshian2/20e613b9c81e3a62919e2f8a160d8906 to your computer and use it in GitHub Desktop.
PyTorch Mixed Precision/FP16
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.nn as nn | |
from torch.autograd import Variable | |
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors | |
# https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/fp16util.py | |
class tofp16(nn.Module):
    """Utility module that casts its input tensor to half precision.

    Equivalent to::

        def forward(self, input):
            return input.half()
    """

    def __init__(self):
        super(tofp16, self).__init__()

    def forward(self, input):
        # Cast activations to FP16 on the way into the wrapped network.
        return input.half()
def BN_convert_float(module):
    """Recursively cast affine batch-norm layers of *module* back to FP32.

    Utility function for network_to_half(). Retained for legacy purposes.
    Returns the (mutated) module for chaining.
    """
    is_affine_bn = (
        isinstance(module, torch.nn.modules.batchnorm._BatchNorm)
        and module.affine is True
    )
    if is_affine_bn:
        # Batch-norm statistics are numerically fragile in FP16.
        module.float()
    for submodule in module.children():
        BN_convert_float(submodule)
    return module
def network_to_half(network):
    """Convert *network* to half precision in a batchnorm-safe way.

    Wraps the network so inputs are cast to FP16 while affine batch-norm
    layers are kept in FP32. Retained for legacy purposes; using FP16Model
    is recommended instead.
    """
    half_network = BN_convert_float(network.half())
    return nn.Sequential(tofp16(), half_network)
def convert_module(module, dtype):
    """Cast *module*'s immediate (non-recursive) parameters and buffers to *dtype*.

    Only floating-point tensors are converted; existing gradients are
    converted alongside their parameters.
    """
    for p in module.parameters(recurse=False):
        if p is None:
            continue
        if p.data.dtype.is_floating_point:
            p.data = p.data.to(dtype=dtype)
        if p._grad is not None and p._grad.data.dtype.is_floating_point:
            p._grad.data = p._grad.data.to(dtype=dtype)
    for b in module.buffers(recurse=False):
        if b is not None and b.data.dtype.is_floating_point:
            b.data = b.data.to(dtype=dtype)
def convert_network(network, dtype):
    """Cast every submodule of *network* to *dtype*, keeping affine
    batch-norm layers in FP32 for numerical stability.

    RNN modules get their parameters re-flattened after conversion so cuDNN
    can use the fused kernels. Returns the (mutated) network.
    """
    for m in network.modules():
        skip_bn = (
            isinstance(m, torch.nn.modules.batchnorm._BatchNorm)
            and m.affine is True
        )
        if skip_bn:
            continue
        convert_module(m, dtype)
        if isinstance(m, (torch.nn.RNNBase, torch.nn.modules.rnn.RNNBase)):
            m.flatten_parameters()
    return network
class FP16Model(nn.Module):
    """Batchnorm-safe FP16 wrapper.

    Converts the wrapped network's weights to half precision (batch-norms
    stay FP32) and casts all inputs to half on every forward pass.
    """

    def __init__(self, network):
        super(FP16Model, self).__init__()
        self.network = convert_network(network, dtype=torch.half)

    def forward(self, *inputs):
        half_inputs = tuple(x.half() for x in inputs)
        return self.network(*half_inputs)
def backwards_debug_hook(grad):
    """Gradient hook that fails loudly.

    Master parameters must never receive a gradient directly from autograd —
    gradients are copied in explicitly via model_grads_to_master_grads().

    Raises:
        RuntimeError: always. (Fixed typo: "recieved" -> "received".)
    """
    raise RuntimeError("master_params received a gradient in the backward pass!")
def prep_param_lists(model, flat_master=False):
    """Create a list of FP32 master parameters for a given model, as in
    `Training Neural Networks with Mixed Precision: Real Examples`_.

    Args:
        model (torch.nn.Module): Existing Pytorch model.
        flat_master (bool, optional, default=False): Flatten the master
            parameters into a single tensor, as a performance optimization.

    Returns:
        A tuple (``model_params``, ``master_params``). ``model_params`` is a
        list of the model's parameters for later use with
        :func:`model_grads_to_master_grads` and
        :func:`master_params_to_model_params`. ``master_params`` is a list of
        FP32 master parameters; with ``flat_master=True`` it has one element.

    .. warning::
        With ``flat_master=True`` all model parameters must share one dtype.

    .. _`Training Neural Networks with Mixed Precision: Real Examples`:
        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
    """
    model_params = [p for p in model.parameters() if p.requires_grad]

    if not flat_master:
        # One independent FP32 master copy per model parameter.
        master_params = [p.clone().float().detach() for p in model_params]
        for p in master_params:
            p.requires_grad = True
        return model_params, master_params

    # flat_master: fuse everything into one contiguous FP32 tensor.
    # NOTE(review): bare except kept to preserve the original behavior of
    # annotating *any* flattening failure before re-raising.
    try:
        # flatten_dense_tensors returns a contiguous flat array.
        # http://pytorch.org/docs/master/_modules/torch/_utils.html
        flat = _flatten_dense_tensors([p.data for p in model_params]).float()
    except:
        print("Error in prep_param_lists: model may contain a mixture of parameters "
              "of different types. Use flat_master=False, or use F16_Optimizer.")
        raise
    master = torch.nn.Parameter(flat)
    master.requires_grad = True
    # master.register_hook(backwards_debug_hook)
    if master.grad is None:
        # Pre-allocate (uninitialized) grad storage matching the flat tensor.
        master.grad = master.new(*master.size())
    return model_params, [master]
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
    """Copy model gradients into the FP32 master gradients.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by
            :func:`prep_param_lists`. If it was created with
            ``flat_master=True``, pass ``flat_master=True`` here as well.
    """
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        flat_grads = _flatten_dense_tensors([p.grad.data for p in model_params])
        master_params[0].grad.data.copy_(flat_grads)
        return
    for model, master in zip(model_params, master_params):
        if model.grad is None:
            # Mirror a missing model grad on the master side.
            master.grad = None
        else:
            if master.grad is None:
                # Lazily allocate master grad storage on first use.
                master.grad = Variable(master.data.new(*master.data.size()))
            master.grad.data.copy_(model.grad.data)
def master_params_to_model_params(model_params, master_params, flat_master=False):
    """Copy FP32 master parameter values back into the model parameters.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by
            :func:`prep_param_lists`. If it was created with
            ``flat_master=True``, pass ``flat_master=True`` here as well.
    """
    if flat_master:
        # Split the single flat master tensor back into per-parameter views.
        unflat = _unflatten_dense_tensors(master_params[0].data, model_params)
        for model, master in zip(model_params, unflat):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)
# Backward compatibility fixes | |
def to_python_float(t):
    """Return a Python scalar from *t*.

    Uses ``.item()`` when available (modern tensors); falls back to ``t[0]``
    for legacy 0.3-style tensors/sequences. Backward-compatibility shim.
    """
    return t.item() if hasattr(t, 'item') else t[0]
# Pick the gradient-clipping API matching the installed torch version:
# the function grew a trailing underscore after torch 0.4.
_version_fields = torch.__version__.split('.')
TORCH_MAJOR = int(_version_fields[0])
TORCH_MINOR = int(_version_fields[1])
if TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
    clip_grad_norm = torch.nn.utils.clip_grad_norm
else:
    clip_grad_norm = torch.nn.utils.clip_grad_norm_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.optim as optim | |
import torchvision | |
import torchvision.transforms as transforms | |
from pytorch_models import Layer10CNN, WideResNet | |
import numpy as np | |
import datetime | |
import time | |
import pickle | |
import os | |
from fp16util import network_to_half | |
def dataloaders(batch_size):
    """Build CIFAR-10 train/test DataLoaders.

    torchvision's ToTensor yields values in [0, 1]; no further normalization
    is applied. Downloads the dataset into ./data on first use.
    """
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
    train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                             download=True, transform=transform)
    test_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                            download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                               shuffle=True, num_workers=1)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                              shuffle=False, num_workers=1)
    # num_workers tuning (plain half() FP16 conversion):
    #   WRN 512 1GPU: workers1 80s / workers4 83s / workers8 88s
    #   WRN 512 2GPU: workers1 44s / workers4 47s / workers8 52s
    # -> a single worker is fastest here.
    # With fp16util.py conversion:
    #   WRN 512 1GPU: workers1 76s / workers4 78s
    #   WRN 512 2GPU: workers1 40s / workers4 44s
    return train_loader, test_loader
def train(batch_size, network, use_device):
    """Train the selected network on CIFAR-10 with FP16 weights and log metrics.

    Args:
        batch_size: mini-batch size; the LR is scaled linearly vs. a base of 128.
        network: 0 -> Layer10CNN, 1 -> WideResNet.
        use_device: "multigpu" wraps the model in DataParallel; any other
            value runs on a single GPU.

    Side effects:
        Pickles a metrics dict to result/{use_device}_{network}_{batch_size}.pkl.
    """
    if network == 0:
        model = Layer10CNN()
    elif network == 1:
        model = WideResNet()
    device = "cuda"
    torch.backends.cudnn.benchmark = True
    model = model.cuda()
    train_loader, test_loader = dataloaders(batch_size)
    criterion = torch.nn.CrossEntropyLoss()
    # Linear LR scaling rule relative to a base batch size of 128.
    initial_lr = 0.1 * batch_size / 128
    optimizer = optim.SGD(model.parameters(), lr=initial_lr, momentum=0.9)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 80], gamma=0.1)
    if use_device == "multigpu":
        model = torch.nn.DataParallel(model)
    # network_to_half prepends a tofp16() input caster, so the inputs below
    # need no explicit .half() call.
    model = network_to_half(model)
    result = {}
    result["train_begin"] = datetime.datetime.now()
    result["times"] = []
    result["val_acc"] = []
    result["loss"] = []
    for epoch in range(100):
        start_time = time.time()
        # train
        model.train()
        train_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)  # FP16
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= i + 1  # per batch loss
        # BUG FIX: scheduler.step() was never called, so the milestones=[50, 80]
        # learning-rate drops never took effect.
        scheduler.step()
        # Validation — eval mode so batch-norm running stats are not updated
        # and dropout (if any) is disabled during evaluation.
        model.eval()
        with torch.no_grad():
            correct, total = 0, 0
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, pred = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (pred == labels).sum().item()
            val_acc = correct / total
        # log
        elapsed = time.time() - start_time
        result["times"].append(elapsed)
        result["loss"].append(train_loss)
        result["val_acc"].append(val_acc)
        print(f"Epoch {epoch+1} loss = {train_loss:.06} val_acc = {val_acc:.04} | {elapsed:0.4}s")
    result["train_end"] = datetime.datetime.now()
    # BUG FIX: open() raises FileNotFoundError if the directory is missing.
    os.makedirs("result", exist_ok=True)
    with open(f"result/{use_device}_{network}_{batch_size}.pkl", "wb") as fp:
        pickle.dump(result, fp)
def train_gpus():
    """Sweep network x device x batch-size combinations and train each one.

    Large-batch WideResNet configurations are skipped — presumably to avoid
    running out of GPU memory (TODO confirm).
    """
    batch_sizes = [128, 256, 512, 1024, 2048]
    for network in [0, 1]:
        for device in ["gpu", "multigpu"]:
            for batch in batch_sizes:
                if device == "gpu" and network == 1 and batch > 512:
                    continue
                if device == "multigpu" and network == 1 and batch > 1024:
                    continue
                train(batch, network, device)
                # Pause between runs before starting the next configuration.
                time.sleep(60)
# Script entry point: run the full benchmark sweep when executed directly.
if __name__ == "__main__":
    train_gpus()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import os
import pickle
import time

import numpy as np
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from apex import amp

from pytorch_models import Layer10CNN, WideResNet
def dataloaders(batch_size):
    """Build CIFAR-10 train/test DataLoaders.

    torchvision's ToTensor yields values in [0, 1]; no further normalization
    is applied. Downloads the dataset into ./data on first use.
    """
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
    train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                             download=True, transform=transform)
    test_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                            download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                               shuffle=True, num_workers=1)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                              shuffle=False, num_workers=1)
    # num_workers tuning:
    #   WRN 512 1GPU, O1: workers1 78s / workers4 84s / workers8 90s
    #   WRN 512 2GPU, O1: workers1 46s / workers4 48s / workers8 55s
    # -> a single worker is fastest here.
    return train_loader, test_loader
def train(batch_size, network, use_device):
    """Train the selected network on CIFAR-10 with apex AMP and log metrics.

    Args:
        batch_size: mini-batch size; the LR is scaled linearly vs. a base of 128.
        network: 0 -> Layer10CNN, 1 -> WideResNet.
        use_device: "multigpu" wraps the model in DataParallel; any other
            value runs on a single GPU.

    Side effects:
        Pickles a metrics dict to result/{use_device}_{network}_{batch_size}.pkl.
    """
    if network == 0:
        model = Layer10CNN()
    elif network == 1:
        model = WideResNet()
    device = "cuda"
    torch.backends.cudnn.benchmark = True
    model = model.cuda()
    train_loader, test_loader = dataloaders(batch_size)
    criterion = torch.nn.CrossEntropyLoss()
    # Linear LR scaling rule relative to a base batch size of 128.
    initial_lr = 0.1 * batch_size / 128
    optimizer = optim.SGD(model.parameters(), lr=initial_lr, momentum=0.9)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 80], gamma=0.1)
    # opt_level: O1 = mixed precision (recommended), O2 = almost FP16, O3 = FP16.
    # O2/O3 do not work with multiple GPUs (known apex issue).
    # WRN 1GPU 512: O1 = 80s / O3 = 77s
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if use_device == "multigpu":
        model = torch.nn.DataParallel(model)
    result = {}
    result["train_begin"] = datetime.datetime.now()
    result["times"] = []
    result["val_acc"] = []
    result["loss"] = []
    for epoch in range(100):
        start_time = time.time()
        # train
        model.train()
        train_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # AMP loss scaling replaces the plain loss.backward() call.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= i + 1  # per batch loss
        # BUG FIX: scheduler.step() was never called, so the milestones=[50, 80]
        # learning-rate drops never took effect.
        scheduler.step()
        # Validation — eval mode so batch-norm running stats are not updated
        # and dropout (if any) is disabled during evaluation.
        model.eval()
        with torch.no_grad():
            correct, total = 0, 0
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, pred = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (pred == labels).sum().item()
            val_acc = correct / total
        # log
        elapsed = time.time() - start_time
        result["times"].append(elapsed)
        result["loss"].append(train_loss)
        result["val_acc"].append(val_acc)
        print(f"Epoch {epoch+1} loss = {train_loss:.06} val_acc = {val_acc:.04} | {elapsed:0.4}s")
    result["train_end"] = datetime.datetime.now()
    # BUG FIX: open() raises FileNotFoundError if the directory is missing.
    os.makedirs("result", exist_ok=True)
    with open(f"result/{use_device}_{network}_{batch_size}.pkl", "wb") as fp:
        pickle.dump(result, fp)
def train_gpus():
    """Sweep network x device x batch-size combinations and train each one.

    Large-batch WideResNet configurations are skipped — presumably to avoid
    running out of GPU memory (TODO confirm).
    """
    batch_sizes = [128, 256, 512, 1024, 2048]
    for network in [0, 1]:
        for device in ["gpu", "multigpu"]:
            for batch in batch_sizes:
                if device == "gpu" and network == 1 and batch > 512:
                    continue
                if device == "multigpu" and network == 1 and batch > 1024:
                    continue
                train(batch, network, device)
                # Pause between runs before starting the next configuration.
                time.sleep(60)
# Script entry point: run the full benchmark sweep when executed directly.
if __name__ == "__main__":
    train_gpus()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment