Skip to content

Instantly share code, notes, and snippets.

@sam186
Last active October 11, 2017 18:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sam186/75e500452cbf9b863576e2d47c7733e7 to your computer and use it in GitHub Desktop.
Save sam186/75e500452cbf9b863576e2d47c7733e7 to your computer and use it in GitHub Desktop.
pytorch dataparallel hang
#0 0x00007ffff76c1827 in futex_abstimed_wait_cancelable (private=0, abstime=0x0, expected=0,
futex_word=0x7fff04000c10) at ../sysdeps/unix/sysv/linux/futex-internal.h:205
#1 do_futex_wait (sem=sem@entry=0x7fff04000c10, abstime=0x0) at sem_waitcommon.c:111
#2 0x00007ffff76c18d4 in __new_sem_wait_slow (sem=0x7fff04000c10, abstime=0x0) at sem_waitcommon.c:181
#3 0x00007ffff76c197a in __new_sem_wait (sem=<optimized out>) at sem_wait.c:29
#4 0x00007ffff7a61b33 in PyThread_acquire_lock_timed (lock=0x7fff04000c10, microseconds=-1000000, intr_flag=1)
at Python/thread_pthread.h:354
#5 0x00007ffff7a68804 in acquire_timed (lock=0x7fff04000c10, timeout=-1000000000) at ./Modules/_threadmodule.c:68
#6 0x00007ffff7a68946 in lock_PyThread_acquire_lock (self=0x7ffff6456418, args=<optimized out>,
kwds=<optimized out>) at ./Modules/_threadmodule.c:151
#7 0x00007ffff7992302 in _PyCFunction_FastCallDict (func_obj=0x7fffc6377cf0, args=0x7fffbc07baf8,
nargs=<optimized out>, kwargs=0x0) at Objects/methodobject.c:231
#8 0x00007ffff7a17b8c in call_function (pp_stack=0x7fffffffcb98, oparg=<optimized out>, kwnames=0x0)
at Python/ceval.c:4809
#9 0x00007ffff7a1ad40 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>)
at Python/ceval.c:3295
#10 0x00007ffff7a16100 in _PyEval_EvalCodeWithName (_co=0x7ffff0b311e0, globals=<optimized out>,
locals=<optimized out>, args=<optimized out>, argcount=1, kwnames=0x0, kwargs=0x7fffbc084730, kwcount=0,
kwstep=1, defs=0x7ffff0b26360, defcount=2, kwdefs=0x0, closure=0x0, name=0x7ffff0b27930,
qualname=0x7ffff0b30440) at Python/ceval.c:4139
#11 0x00007ffff7a17b2a in fast_function (kwnames=<optimized out>, nargs=1, stack=<optimized out>,
func=0x7ffff0b389d8) at Python/ceval.c:4950
#12 call_function (pp_stack=0x7fffffffce38, oparg=<optimized out>, kwnames=<optimized out>) at Python/ceval.c:4830
#13 0x00007ffff7a1ad40 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>)
at Python/ceval.c:3295
#14 0x00007ffff7a16100 in _PyEval_EvalCodeWithName (_co=0x7ffff0b31150, globals=<optimized out>,
locals=<optimized out>, args=<optimized out>, argcount=1, kwnames=0x0, kwargs=0x7fff0927a1c0, kwcount=0,
kwstep=1, defs=0x7ffff0b322c8, defcount=1, kwdefs=0x0, closure=0x0, name=0x7ffff7f96d18,
qualname=0x7ffff0b28c70) at Python/ceval.c:4139
#15 0x00007ffff7a17b2a in fast_function (kwnames=<optimized out>, nargs=1, stack=<optimized out>,
func=0x7ffff0b38950) at Python/ceval.c:4950
#16 call_function (pp_stack=0x7fffffffd0d8, oparg=<optimized out>, kwnames=<optimized out>) at Python/ceval.c:4830
#17 0x00007ffff7a1ad40 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>)
at Python/ceval.c:3295
#18 0x00007ffff7a16100 in _PyEval_EvalCodeWithName (_co=0x7fffc6383810, globals=<optimized out>,
locals=<optimized out>, args=<optimized out>, argcount=4, kwnames=0x0, kwargs=0x7fffbc07c5f8, kwcount=0,
kwstep=1, defs=0x7fffc66917e0, defcount=2, kwdefs=0x0, closure=0x0, name=0x7fffc6385a30,
qualname=0x7fffc6385a30) at Python/ceval.c:4139
#19 0x00007ffff7a17b2a in fast_function (kwnames=<optimized out>, nargs=4, stack=<optimized out>,
func=0x7fffc63888c8) at Python/ceval.c:4950
#20 call_function (pp_stack=0x7fffffffd378, oparg=<optimized out>, kwnames=<optimized out>) at Python/ceval.c:4830
#21 0x00007ffff7a1ad40 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>)
at Python/ceval.c:3295
#22 0x00007ffff7a15514 in _PyFunction_FastCall (co=<optimized out>, args=<optimized out>, nargs=4,
globals=<optimized out>) at Python/ceval.c:4891
#23 0x00007ffff7a17c88 in fast_function (kwnames=0x0, nargs=4, stack=<optimized out>, func=0x7fffc63a1620)
at Python/ceval.c:4926
#24 call_function (pp_stack=0x7fffffffd5a8, oparg=<optimized out>, kwnames=0x0) at Python/ceval.c:4830
#25 0x00007ffff7a1ad40 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>)
at Python/ceval.c:3295
#26 0x00007ffff7a16100 in _PyEval_EvalCodeWithName (_co=0x7fffc638da50, globals=<optimized out>,
locals=<optimized out>, args=<optimized out>, argcount=2, kwnames=0x7ffff7f91060, kwargs=0x7ffff7f91068,
kwcount=0, kwstep=2, defs=0x0, defcount=0, kwdefs=0x0, closure=0x0, name=0x7fffc67057a0,
qualname=0x7fffc6391df8) at Python/ceval.c:4139
#27 0x00007ffff7a1639c in _PyFunction_FastCallDict (func=0x7fffc63a1488, args=0x7fffffffd7e0, nargs=2,
kwargs=0x7ffff64433a8) at Python/ceval.c:5042
#28 0x00007ffff793ace6 in _PyObject_FastCallDict (func=0x7fffc63a1488, args=0x7fffffffd7e0, nargs=<optimized out>,
kwargs=0x7ffff64433a8) at Objects/abstract.c:2295
#29 0x00007ffff793af3c in _PyObject_Call_Prepend (func=0x7fffc63a1488, obj=0x7ffff6441f98, args=0x7ffff7e74fd0,
kwargs=0x7ffff64433a8) at Objects/abstract.c:2358
#30 0x00007ffff793afd6 in PyObject_Call (func=0x7ffff7f878c8, args=<optimized out>, kwargs=<optimized out>)
at Objects/abstract.c:2246
#31 0x00007ffff7a1bfc9 in do_call_core (kwdict=0x7ffff64433a8, callargs=<optimized out>, func=0x7ffff7f878c8)
at Python/ceval.c:5078
#32 _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at Python/ceval.c:3377
#33 0x00007ffff7a16100 in _PyEval_EvalCodeWithName (_co=0x7fffc654a660, globals=<optimized out>,
locals=<optimized out>, args=<optimized out>, argcount=2, kwnames=0x0, kwargs=0x8, kwcount=0, kwstep=2,
defs=0x0, defcount=0, kwdefs=0x0, closure=0x0, name=0x7ffff7f94170, qualname=0x7fffc66a1230)
at Python/ceval.c:4139
#34 0x00007ffff7a1639c in _PyFunction_FastCallDict (func=0x7fffc6430488, args=0x7fffffffdbd0, nargs=2, kwargs=0x0)
at Python/ceval.c:5042
#35 0x00007ffff793ace6 in _PyObject_FastCallDict (func=0x7fffc6430488, args=0x7fffffffdbd0, nargs=<optimized out>,
kwargs=0x0) at Objects/abstract.c:2295
#36 0x00007ffff793af3c in _PyObject_Call_Prepend (func=0x7fffc6430488, obj=0x7ffff6441f98, args=0x7ffff7e74ef0,
kwargs=0x0) at Objects/abstract.c:2358
#37 0x00007ffff793afd6 in PyObject_Call (func=0x7ffff7f87908, args=<optimized out>, kwargs=<optimized out>)
at Objects/abstract.c:2246
#38 0x00007ffff79b344f in slot_tp_call (self=0x7ffff6441f98, args=0x7ffff7e74ef0, kwds=0x0)
at Objects/typeobject.c:6194
#39 0x00007ffff793ac1e in _PyObject_FastCallDict (func=0x7ffff6441f98, args=<optimized out>, nargs=<optimized out>,
kwargs=0x0) at Objects/abstract.c:2316
#40 0x00007ffff7a1795b in call_function (pp_stack=0x7fffffffdec8, oparg=<optimized out>, kwnames=0x0)
at Python/ceval.c:4833
#41 0x00007ffff7a1ad40 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>)
at Python/ceval.c:3295
#42 0x00007ffff7a16100 in _PyEval_EvalCodeWithName (_co=0x7ffff7f028a0, globals=<optimized out>,
locals=<optimized out>, args=<optimized out>, argcount=0, kwnames=0x0, kwargs=0x8, kwcount=0, kwstep=2,
defs=0x0, defcount=0, kwdefs=0x0, closure=0x0, name=0x0, qualname=0x0) at Python/ceval.c:4139
#43 0x00007ffff7a16583 in PyEval_EvalCodeEx (_co=<optimized out>, globals=<optimized out>, locals=<optimized out>,
args=<optimized out>, argcount=<optimized out>, kws=<optimized out>, kwcount=0, defs=0x0, defcount=0,
kwdefs=0x0, closure=0x0) at Python/ceval.c:4160
#44 0x00007ffff7a165cb in PyEval_EvalCode (co=<optimized out>, globals=<optimized out>, locals=<optimized out>)
at Python/ceval.c:695
#45 0x00007ffff7a48ee0 in run_mod (arena=0x7ffff7f63270, flags=0x7fffffffe220, locals=0x7ffff7f45f78,
globals=0x7ffff7f45f78, filename=0x7ffff7e870a0, mod=0x6b1970) at Python/pythonrun.c:980
#46 PyRun_FileExFlags (fp=0x68c070, filename_str=<optimized out>, start=<optimized out>, globals=0x7ffff7f45f78,
locals=0x7ffff7f45f78, closeit=<optimized out>, flags=0x7fffffffe220) at Python/pythonrun.c:933
#47 0x00007ffff7a4a4a3 in PyRun_SimpleFileExFlags (fp=0x68c070, filename=<optimized out>, closeit=1,
flags=0x7fffffffe220) at Python/pythonrun.c:396
#48 0x00007ffff7a658d5 in run_file (p_cf=0x7fffffffe220, filename=0x6032c0 L"test.py", fp=0x68c070)
at Modules/main.c:338
#49 Py_Main (argc=<optimized out>, argv=<optimized out>) at Modules/main.c:810
#50 0x0000000000400c1d in main (argc=2, argv=<optimized out>) at ./Programs/python.c:69
from torch import nn
from torch.autograd import Variable
import torch
# Minimal reproduction of the reported hang: a forward pass through an
# nn.DataParallel-wrapped layer does not return. The "Checkpoint N" prints
# bracket each step so the last one printed shows how far the script got.
l = nn.Linear(5,5).cuda()  # 5-in / 5-out linear layer, moved to CUDA
pl = nn.DataParallel(l)  # wrap the layer using DataParallel's default arguments
print("Checkpoint 1")
# Random 5x5 input created on CUDA and wrapped in a Variable with requires_grad=True.
a = Variable(torch.rand(5,5).cuda(), requires_grad=True)
print("Checkpoint 2")
# Reported hang: this forward call gets stuck, so "Checkpoint 3" is never
# printed. The accompanying gdb backtrace shows the main thread blocked in
# PyThread_acquire_lock_timed (waiting on a Python thread lock).
print(pl(a)) # Here it gets stuck
print("Checkpoint 3")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment