Created
September 5, 2019 09:17
-
-
Save maxhgerlach/7e7ccbd16714c7a070156f88fce7d291 to your computer and use it in GitHub Desktop.
hvd.init() hangs with error message help-opal-shmem-mmap.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(gdb) set pagination off | |
(gdb) thread apply all bt | |
Thread 4 (Thread 0x7efdfdfab700 (LWP 330906)): | |
#0 0x00007efe181faa13 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84 | |
#1 0x00007efd73f3b138 in epoll_dispatch (base=0x7efd280b78d0, tv=<optimized out>) at epoll.c:407 | |
#2 0x00007efd73f3e4ff in opal_libevent2022_event_base_loop (base=0x7efd280b78d0, flags=1) at event.c:1630 | |
#3 0x00007efd2dbdee9e in progress_engine () from /opt/openmpi/lib/openmpi/mca_pmix_pmix3x.so | |
#4 0x00007efe184c46ba in start_thread (arg=0x7efdfdfab700) at pthread_create.c:333 | |
#5 0x00007efe181fa41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 | |
Thread 3 (Thread 0x7efdfd7aa700 (LWP 330905)): | |
#0 0x00007efe181ee74d in poll () at ../sysdeps/unix/syscall-template.S:84 | |
#1 0x00007efd73f47db8 in poll (__timeout=<optimized out>, __nfds=1, __fds=0x7efd200008c0) at /usr/include/x86_64-linux-gnu/bits/poll2.h:46 | |
#2 poll_dispatch (base=0x7efd2805f880, tv=<optimized out>) at poll.c:165 | |
#3 0x00007efd73f3e4ff in opal_libevent2022_event_base_loop (base=0x7efd2805f880, flags=1) at event.c:1630 | |
#4 0x00007efd73ef988e in progress_engine () from /opt/openmpi/lib/libopen-pal.so.40 | |
#5 0x00007efe184c46ba in start_thread (arg=0x7efdfd7aa700) at pthread_create.c:333 | |
#6 0x00007efe181fa41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 | |
Thread 2 (Thread 0x7efdfcfa9700 (LWP 330904)): | |
#0 0x00007efe181ee74d in poll () at ../sysdeps/unix/syscall-template.S:84 | |
#1 0x00007efd73f47db8 in poll (__timeout=<optimized out>, __nfds=4, __fds=0x7efd282ca770) at /usr/include/x86_64-linux-gnu/bits/poll2.h:46 | |
#2 poll_dispatch (base=0x7efd28046e90, tv=<optimized out>) at poll.c:165 | |
#3 0x00007efd73f3e4ff in opal_libevent2022_event_base_loop (base=0x7efd28046e90, flags=2) at event.c:1630 | |
#4 0x00007efd73ef3ce8 in opal_progress () from /opt/openmpi/lib/libopen-pal.so.40 | |
#5 0x00007efd73efa475 in ompi_sync_wait_mt () from /opt/openmpi/lib/libopen-pal.so.40 | |
#6 0x00007efd744c0e70 in ompi_request_wait_completion () from /opt/openmpi/lib/libmpi.so.40 | |
#7 0x00007efd744c29a6 in ompi_comm_nextcid () from /opt/openmpi/lib/libmpi.so.40 | |
#8 0x00007efd744bd595 in ompi_comm_dup_with_info () from /opt/openmpi/lib/libmpi.so.40 | |
#9 0x00007efd744f58a3 in PMPI_Comm_dup () from /opt/openmpi/lib/libmpi.so.40 | |
#10 0x00007efd747e97f6 in horovod::common::(anonymous namespace)::BackgroundThreadLoop (state=..., ctx=...) at horovod/common/operations.cc:929 | |
#11 0x00007efda6206c80 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | |
#12 0x00007efe184c46ba in start_thread (arg=0x7efdfcfa9700) at pthread_create.c:333 | |
#13 0x00007efe181fa41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 | |
Thread 1 (Thread 0x7efe188dd700 (LWP 330376)): | |
#0 0x00007efe184cdc1d in nanosleep () at ../sysdeps/unix/syscall-template.S:84 | |
#1 0x00007efd747e2904 in std::this_thread::sleep_for<long, std::ratio<1l, 1000l> > (__rtime=...) at /usr/include/c++/5/thread:292 | |
#2 horovod::common::(anonymous namespace)::InitializeHorovodOnce (ranks=<optimized out>, nranks=<optimized out>) at horovod/common/operations.cc:1610 | |
#3 0x00007efe16baee40 in ffi_call_unix64 () from /usr/lib/x86_64-linux-gnu/libffi.so.6 | |
#4 0x00007efe16bae8ab in ffi_call () from /usr/lib/x86_64-linux-gnu/libffi.so.6 | |
#5 0x00007efe170943df in _call_function_pointer (argcount=2, resmem=0x7ffceeaf0360, restype=<optimized out>, atypes=<optimized out>, avalues=0x7ffceeaf0340, pProc=0x7efd747e29d0 <horovod::common::horovod_init(int const*, int)>, flags=4353) at /build/python2.7-iaXs16/python2.7-2.7.12/Modules/_ctypes/callproc.c:837 | |
#6 _ctypes_callproc (pProc=0x7efd747e29d0 <horovod::common::horovod_init(int const*, int)>, argtuple=<optimized out>, flags=4353, argtypes=0x0, restype=<_ctypes.PyCSimpleType at remote 0x2353630>, checker=0x0) at /build/python2.7-iaXs16/python2.7-2.7.12/Modules/_ctypes/callproc.c:1180 | |
#7 0x00007efe17098d82 in PyCFuncPtr_call.lto_priv.107 (self=self@entry=0x7efd703fdd50, inargs=inargs@entry=(<c_int_Array_0 at remote 0x7efd7037e950>, <c_int at remote 0x7efd6fab8950>), kwds=kwds@entry=0x0) at /build/python2.7-iaXs16/python2.7-2.7.12/Modules/_ctypes/_ctypes.c:3954 | |
#8 0x00000000004c166d in PyObject_Call (kw=0x0, arg=(<c_int_Array_0 at remote 0x7efd7037e950>, <c_int at remote 0x7efd6fab8950>), func=<_FuncPtr(__name__='horovod_init') at remote 0x7efd703fdd50>) at ../Objects/abstract.c:2546 | |
#9 do_call (nk=<optimized out>, na=<optimized out>, pp_stack=0x7ffceeaf05e0, func=<_FuncPtr(__name__='horovod_init') at remote 0x7efd703fdd50>) at ../Python/ceval.c:4567 | |
#10 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf05e0) at ../Python/ceval.c:4372 | |
#11 PyEval_EvalFrameEx () at ../Python/ceval.c:2987 | |
#12 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582 | |
#13 0x00000000004c17c6 in fast_function (nk=<optimized out>, na=<optimized out>, n=<optimized out>, pp_stack=0x7ffceeaf07e0, func=<function at remote 0x7efd7cb22a28>) at ../Python/ceval.c:4445 | |
#14 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf07e0) at ../Python/ceval.c:4370 | |
#15 PyEval_EvalFrameEx () at ../Python/ceval.c:2987 | |
#16 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582 | |
#17 0x00000000004c1f56 in fast_function (nk=<optimized out>, na=<optimized out>, n=0, pp_stack=0x7ffceeaf09e0, func=<function at remote 0x7efd73e83668>) at ../Python/ceval.c:4445 | |
#18 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf09e0) at ../Python/ceval.c:4370 | |
#19 PyEval_EvalFrameEx () at ../Python/ceval.c:2987 | |
#20 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582 | |
#21 0x00000000004c1f56 in fast_function (nk=<optimized out>, na=<optimized out>, n=0, pp_stack=0x7ffceeaf0be0, func=<function at remote 0x7efd70b81c08>) at ../Python/ceval.c:4445 | |
#22 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf0be0) at ../Python/ceval.c:4370 | |
#23 PyEval_EvalFrameEx () at ../Python/ceval.c:2987 | |
#24 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582 | |
#25 0x00000000004c1f56 in fast_function (nk=<optimized out>, na=<optimized out>, n=1, pp_stack=0x7ffceeaf0de0, func=<function at remote 0x7efd70b838c0>) at ../Python/ceval.c:4445 | |
#26 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf0de0) at ../Python/ceval.c:4370 | |
#27 PyEval_EvalFrameEx () at ../Python/ceval.c:2987 | |
#28 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582 | |
#29 0x00000000004eb69f in PyEval_EvalCode (locals={'defaultdict': <type at remote 0x92b3e0>, 'OrderedDict': <type at remote 0x213ae30>, 'Union': <_Union at remote 0x7efe175a6680>, 'random': <module at remote 0x7efe16fd8130>, 'absolute_import': <_Feature(mandatory=(3, 0, 0, 'alpha', 0), optional=(2, 5, 0, 'alpha', 1), compiler_flag=16384) at remote 0x7efe18790ab8>, 'subprocess': <module at remote 0x7efe16fe8478>, 'BinaryIO': <GenericMeta(__module__='typing', __enter__=<function at remote 0x7efe16fcd578>, __args__=None, __origin__=None, __tree_hash__=2261999, __parameters__=(), _abc_cache=<WeakSet(_remove=<function at remote 0x7efe16fcd5f0>, _pending_removals=[], _iterating=set([]), data=set([])) at remote 0x7efe16fc6f50>, _abc_generic_negative_cache_version=15, __abstractmethods__=frozenset(['__exit__', 'truncate', 'read', 'readlines', 'writable', 'flush', 'seekable', 'isatty', 'readline', 'seek', 'close', 'fileno', 'writelines', 'name', '__enter__', 'readable', 'write', 'mode', 'closed', 'tell']), write=<function at remote 0x7efe16fcd488>, _gorg=<...>, __slo...(truncated), globals={'defaultdict': <type at remote 0x92b3e0>, 'OrderedDict': <type at remote 0x213ae30>, 'Union': <_Union at remote 0x7efe175a6680>, 'random': <module at remote 0x7efe16fd8130>, 'absolute_import': <_Feature(mandatory=(3, 0, 0, 'alpha', 0), optional=(2, 5, 0, 'alpha', 1), compiler_flag=16384) at remote 0x7efe18790ab8>, 'subprocess': <module at remote 0x7efe16fe8478>, 'BinaryIO': <GenericMeta(__module__='typing', __enter__=<function at remote 0x7efe16fcd578>, __args__=None, __origin__=None, __tree_hash__=2261999, __parameters__=(), _abc_cache=<WeakSet(_remove=<function at remote 0x7efe16fcd5f0>, _pending_removals=[], _iterating=set([]), data=set([])) at remote 0x7efe16fc6f50>, _abc_generic_negative_cache_version=15, __abstractmethods__=frozenset(['__exit__', 'truncate', 'read', 'readlines', 'writable', 'flush', 'seekable', 'isatty', 'readline', 'seek', 'close', 'fileno', 'writelines', 'name', '__enter__', 'readable', 'write', 'mode', 'closed', 'tell']), write=<function at remote 0x7efe16fcd488>, _gorg=<...>, __slo...(truncated), co=0x7efe16a967b0) at ../Python/ceval.c:669 | |
#30 run_mod.lto_priv () at ../Python/pythonrun.c:1376 | |
#31 0x00000000004e58f2 in PyRun_FileExFlags () at ../Python/pythonrun.c:1362 | |
#32 0x00000000004e41a6 in PyRun_SimpleFileExFlags () at ../Python/pythonrun.c:948 | |
#33 0x00000000004938ce in Py_Main () at ../Modules/main.c:640 | |
#34 0x00007efe18113830 in __libc_start_main (main=0x493370 <main>, argc=7, argv=0x7ffceeaf1228, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7ffceeaf1218) at ../csu/libc-start.c:291 | |
#35 0x0000000000493299 in _start () |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment