nn.DataParallel(model).cuda() hang
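The MNIST example below hangs when the model is wrapped in torch.nn.DataParallel(..., device_ids=[0, 1]).cuda(); a gdb backtrace of the stuck process is attached after the script.

Before digging into the model, it can help to check whether raw GPU-to-GPU communication completes at all: DataParallel broadcasts the module's parameters across devices on every forward pass, and on some systems peer-to-peer copies between GPUs silently hang (IOMMU/ACS interference is a common cause). A minimal standalone check, not part of the original gist:

import torch
import torch.cuda.comm as comm

# Assumption: two visible GPUs with ids 0 and 1, as in the script below.
# comm.broadcast is roughly what DataParallel's replicate step does; if
# this hangs too, the problem is inter-GPU communication, not the model.
x = torch.randn(1024, 1024).cuda(0)
copies = comm.broadcast(x, devices=[0, 1])
torch.cuda.synchronize()
print([t.get_device() for t in copies])  # expect [0, 1]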
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x)

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


model = Net()
if args.cuda:
    model = torch.nn.DataParallel(model, device_ids=[0, 1]).cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            pred = output.data.max(1)[1]  # index of the max log-probability
            correct = pred.eq(target.data).sum()
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}/{} ({:.0f}%)'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0],
                correct, len(target),
                100. * correct / len(target)))


def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += F.nll_loss(output, target).data[0]
        pred = output.data.max(1)[1]  # index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()
    test_loss /= len(test_loader)  # loss function already averages over batch size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, args.epochs + 1):
    train(epoch)
    test(epoch)
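One way to isolate the hang (a suggestion, not part of the original report) is to restrict DataParallel to a single device. A single-device wrapper goes through the same code path minus the inter-GPU scatter and broadcast, so if it trains normally while device_ids=[0, 1] hangs, the cross-GPU transfers are the likely culprit rather than the training loop:

# Hypothetical isolation step: replace the DataParallel line in the
# script above with a single-device wrapper and rerun.
model = torch.nn.DataParallel(Net(), device_ids=[0]).cuda()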
gdb backtrace of the hung process:
#0 0x00007ffff7bc9827 in futex_abstimed_wait_cancelable (private=0, abstime=0x0, expected=0, futex_word=0x7fff94000c10)
    at ../sysdeps/unix/sysv/linux/futex-internal.h:205
#1 do_futex_wait (sem=sem@entry=0x7fff94000c10, abstime=0x0) at sem_waitcommon.c:111
#2 0x00007ffff7bc98d4 in __new_sem_wait_slow (sem=0x7fff94000c10, abstime=0x0) at sem_waitcommon.c:181
#3 0x00007ffff7bc997a in __new_sem_wait (sem=<optimized out>) at sem_wait.c:29
#4 0x000055555563c516 in PyThread_acquire_lock_timed ()
#5 0x00005555556d63ac in lock_PyThread_acquire_lock ()
#6 0x00005555556616e4 in _PyCFunction_FastCallDict ()
#7 0x00005555556ef4ec in call_function ()
#8 0x0000555555713134 in _PyEval_EvalFrameDefault ()
#9 0x00005555556e8346 in _PyEval_EvalCodeWithName ()
#10 0x00005555556e93b1 in fast_function ()
#11 0x00005555556ef5c5 in call_function ()
#12 0x0000555555713134 in _PyEval_EvalFrameDefault ()
#13 0x00005555556e8346 in _PyEval_EvalCodeWithName ()
#14 0x00005555556e93b1 in fast_function ()
#15 0x00005555556ef5c5 in call_function ()
#16 0x0000555555713134 in _PyEval_EvalFrameDefault ()
#17 0x00005555556e8867 in _PyEval_EvalCodeWithName ()
#18 0x00005555556e93b1 in fast_function ()
#19 0x00005555556ef5c5 in call_function ()
#20 0x0000555555713134 in _PyEval_EvalFrameDefault ()
#21 0x00005555556e917b in fast_function ()
#22 0x00005555556ef5c5 in call_function ()
#23 0x0000555555713134 in _PyEval_EvalFrameDefault ()
#24 0x00005555556e8346 in _PyEval_EvalCodeWithName ()
#25 0x00005555556e9b2d in _PyFunction_FastCallDict ()
#26 0x0000555555661aaf in _PyObject_FastCallDict ()
#27 0x0000555555666563 in _PyObject_Call_Prepend ()
#28 0x00005555556614fe in PyObject_Call ()
#29 0x00005555557148b4 in _PyEval_EvalFrameDefault ()
#30 0x00005555556e8346 in _PyEval_EvalCodeWithName ()
#31 0x00005555556e988f in _PyFunction_FastCallDict ()
#32 0x0000555555661aaf in _PyObject_FastCallDict ()
#33 0x0000555555666563 in _PyObject_Call_Prepend ()
#34 0x00005555556614fe in PyObject_Call ()
#35 0x00005555556c1787 in slot_tp_call ()
#36 0x00005555556618cb in _PyObject_FastCallDict ()
#37 0x00005555556ef63e in call_function ()
#38 0x0000555555713134 in _PyEval_EvalFrameDefault ()
#39 0x00005555556e917b in fast_function ()
#40 0x00005555556ef5c5 in call_function ()
#41 0x0000555555713134 in _PyEval_EvalFrameDefault ()
#42 0x00005555556e9ee8 in PyEval_EvalCodeEx ()
#43 0x00005555556eacac in PyEval_EvalCode ()
#44 0x0000555555767ad4 in run_mod ()
#45 0x0000555555767ed1 in PyRun_FileExFlags ()
#46 0x00005555557680d4 in PyRun_SimpleFileExFlags ()
#47 0x000055555576bb9f in Py_Main ()
#48 0x000055555563371e in main ()
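The trace shows the main thread parked in sem_wait beneath PyThread_acquire_lock (frames #0-#5), i.e. Python blocked waiting to acquire a lock. That is consistent with DataParallel's parallel_apply joining its per-GPU worker threads while one of them never returns from a CUDA call. A stdlib trick for confirming this (my addition, not from the gist) is to dump every Python thread's stack at the moment of the hang:

# Add near the top of the script; when the process wedges, run
# `kill -USR1 <pid>` to print all Python thread stacks to stderr
# (Unix-only; faulthandler requires Python 3.3+).
import faulthandler
import signal

faulthandler.register(signal.SIGUSR1, all_threads=True)

The per-thread stacks then show which worker thread is stuck and on which line of the model, which narrows the hang to a specific CUDA call.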