Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save maxhgerlach/7e7ccbd16714c7a070156f88fce7d291 to your computer and use it in GitHub Desktop.
Save maxhgerlach/7e7ccbd16714c7a070156f88fce7d291 to your computer and use it in GitHub Desktop.
hvd.init() hangs with an Open MPI error message referencing help-opal-shmem-mmap.txt
(gdb) set pagination off
(gdb) thread apply all bt
Thread 4 (Thread 0x7efdfdfab700 (LWP 330906)):
#0 0x00007efe181faa13 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007efd73f3b138 in epoll_dispatch (base=0x7efd280b78d0, tv=<optimized out>) at epoll.c:407
#2 0x00007efd73f3e4ff in opal_libevent2022_event_base_loop (base=0x7efd280b78d0, flags=1) at event.c:1630
#3 0x00007efd2dbdee9e in progress_engine () from /opt/openmpi/lib/openmpi/mca_pmix_pmix3x.so
#4 0x00007efe184c46ba in start_thread (arg=0x7efdfdfab700) at pthread_create.c:333
#5 0x00007efe181fa41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Thread 3 (Thread 0x7efdfd7aa700 (LWP 330905)):
#0 0x00007efe181ee74d in poll () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007efd73f47db8 in poll (__timeout=<optimized out>, __nfds=1, __fds=0x7efd200008c0) at /usr/include/x86_64-linux-gnu/bits/poll2.h:46
#2 poll_dispatch (base=0x7efd2805f880, tv=<optimized out>) at poll.c:165
#3 0x00007efd73f3e4ff in opal_libevent2022_event_base_loop (base=0x7efd2805f880, flags=1) at event.c:1630
#4 0x00007efd73ef988e in progress_engine () from /opt/openmpi/lib/libopen-pal.so.40
#5 0x00007efe184c46ba in start_thread (arg=0x7efdfd7aa700) at pthread_create.c:333
#6 0x00007efe181fa41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Thread 2 (Thread 0x7efdfcfa9700 (LWP 330904)):
#0 0x00007efe181ee74d in poll () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007efd73f47db8 in poll (__timeout=<optimized out>, __nfds=4, __fds=0x7efd282ca770) at /usr/include/x86_64-linux-gnu/bits/poll2.h:46
#2 poll_dispatch (base=0x7efd28046e90, tv=<optimized out>) at poll.c:165
#3 0x00007efd73f3e4ff in opal_libevent2022_event_base_loop (base=0x7efd28046e90, flags=2) at event.c:1630
#4 0x00007efd73ef3ce8 in opal_progress () from /opt/openmpi/lib/libopen-pal.so.40
#5 0x00007efd73efa475 in ompi_sync_wait_mt () from /opt/openmpi/lib/libopen-pal.so.40
#6 0x00007efd744c0e70 in ompi_request_wait_completion () from /opt/openmpi/lib/libmpi.so.40
#7 0x00007efd744c29a6 in ompi_comm_nextcid () from /opt/openmpi/lib/libmpi.so.40
#8 0x00007efd744bd595 in ompi_comm_dup_with_info () from /opt/openmpi/lib/libmpi.so.40
#9 0x00007efd744f58a3 in PMPI_Comm_dup () from /opt/openmpi/lib/libmpi.so.40
#10 0x00007efd747e97f6 in horovod::common::(anonymous namespace)::BackgroundThreadLoop (state=..., ctx=...) at horovod/common/operations.cc:929
#11 0x00007efda6206c80 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#12 0x00007efe184c46ba in start_thread (arg=0x7efdfcfa9700) at pthread_create.c:333
#13 0x00007efe181fa41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Thread 1 (Thread 0x7efe188dd700 (LWP 330376)):
#0 0x00007efe184cdc1d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007efd747e2904 in std::this_thread::sleep_for<long, std::ratio<1l, 1000l> > (__rtime=...) at /usr/include/c++/5/thread:292
#2 horovod::common::(anonymous namespace)::InitializeHorovodOnce (ranks=<optimized out>, nranks=<optimized out>) at horovod/common/operations.cc:1610
#3 0x00007efe16baee40 in ffi_call_unix64 () from /usr/lib/x86_64-linux-gnu/libffi.so.6
#4 0x00007efe16bae8ab in ffi_call () from /usr/lib/x86_64-linux-gnu/libffi.so.6
#5 0x00007efe170943df in _call_function_pointer (argcount=2, resmem=0x7ffceeaf0360, restype=<optimized out>, atypes=<optimized out>, avalues=0x7ffceeaf0340, pProc=0x7efd747e29d0 <horovod::common::horovod_init(int const*, int)>, flags=4353) at /build/python2.7-iaXs16/python2.7-2.7.12/Modules/_ctypes/callproc.c:837
#6 _ctypes_callproc (pProc=0x7efd747e29d0 <horovod::common::horovod_init(int const*, int)>, argtuple=<optimized out>, flags=4353, argtypes=0x0, restype=<_ctypes.PyCSimpleType at remote 0x2353630>, checker=0x0) at /build/python2.7-iaXs16/python2.7-2.7.12/Modules/_ctypes/callproc.c:1180
#7 0x00007efe17098d82 in PyCFuncPtr_call.lto_priv.107 (self=self@entry=0x7efd703fdd50, inargs=inargs@entry=(<c_int_Array_0 at remote 0x7efd7037e950>, <c_int at remote 0x7efd6fab8950>), kwds=kwds@entry=0x0) at /build/python2.7-iaXs16/python2.7-2.7.12/Modules/_ctypes/_ctypes.c:3954
#8 0x00000000004c166d in PyObject_Call (kw=0x0, arg=(<c_int_Array_0 at remote 0x7efd7037e950>, <c_int at remote 0x7efd6fab8950>), func=<_FuncPtr(__name__='horovod_init') at remote 0x7efd703fdd50>) at ../Objects/abstract.c:2546
#9 do_call (nk=<optimized out>, na=<optimized out>, pp_stack=0x7ffceeaf05e0, func=<_FuncPtr(__name__='horovod_init') at remote 0x7efd703fdd50>) at ../Python/ceval.c:4567
#10 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf05e0) at ../Python/ceval.c:4372
#11 PyEval_EvalFrameEx () at ../Python/ceval.c:2987
#12 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582
#13 0x00000000004c17c6 in fast_function (nk=<optimized out>, na=<optimized out>, n=<optimized out>, pp_stack=0x7ffceeaf07e0, func=<function at remote 0x7efd7cb22a28>) at ../Python/ceval.c:4445
#14 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf07e0) at ../Python/ceval.c:4370
#15 PyEval_EvalFrameEx () at ../Python/ceval.c:2987
#16 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582
#17 0x00000000004c1f56 in fast_function (nk=<optimized out>, na=<optimized out>, n=0, pp_stack=0x7ffceeaf09e0, func=<function at remote 0x7efd73e83668>) at ../Python/ceval.c:4445
#18 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf09e0) at ../Python/ceval.c:4370
#19 PyEval_EvalFrameEx () at ../Python/ceval.c:2987
#20 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582
#21 0x00000000004c1f56 in fast_function (nk=<optimized out>, na=<optimized out>, n=0, pp_stack=0x7ffceeaf0be0, func=<function at remote 0x7efd70b81c08>) at ../Python/ceval.c:4445
#22 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf0be0) at ../Python/ceval.c:4370
#23 PyEval_EvalFrameEx () at ../Python/ceval.c:2987
#24 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582
#25 0x00000000004c1f56 in fast_function (nk=<optimized out>, na=<optimized out>, n=1, pp_stack=0x7ffceeaf0de0, func=<function at remote 0x7efd70b838c0>) at ../Python/ceval.c:4445
#26 call_function (oparg=<optimized out>, pp_stack=0x7ffceeaf0de0) at ../Python/ceval.c:4370
#27 PyEval_EvalFrameEx () at ../Python/ceval.c:2987
#28 0x00000000004b9b66 in PyEval_EvalCodeEx () at ../Python/ceval.c:3582
#29 0x00000000004eb69f in PyEval_EvalCode (locals={'defaultdict': <type at remote 0x92b3e0>, 'OrderedDict': <type at remote 0x213ae30>, 'Union': <_Union at remote 0x7efe175a6680>, 'random': <module at remote 0x7efe16fd8130>, 'absolute_import': <_Feature(mandatory=(3, 0, 0, 'alpha', 0), optional=(2, 5, 0, 'alpha', 1), compiler_flag=16384) at remote 0x7efe18790ab8>, 'subprocess': <module at remote 0x7efe16fe8478>, 'BinaryIO': <GenericMeta(__module__='typing', __enter__=<function at remote 0x7efe16fcd578>, __args__=None, __origin__=None, __tree_hash__=2261999, __parameters__=(), _abc_cache=<WeakSet(_remove=<function at remote 0x7efe16fcd5f0>, _pending_removals=[], _iterating=set([]), data=set([])) at remote 0x7efe16fc6f50>, _abc_generic_negative_cache_version=15, __abstractmethods__=frozenset(['__exit__', 'truncate', 'read', 'readlines', 'writable', 'flush', 'seekable', 'isatty', 'readline', 'seek', 'close', 'fileno', 'writelines', 'name', '__enter__', 'readable', 'write', 'mode', 'closed', 'tell']), write=<function at remote 0x7efe16fcd488>, _gorg=<...>, __slo...(truncated), globals={'defaultdict': <type at remote 0x92b3e0>, 'OrderedDict': <type at remote 0x213ae30>, 'Union': <_Union at remote 0x7efe175a6680>, 'random': <module at remote 0x7efe16fd8130>, 'absolute_import': <_Feature(mandatory=(3, 0, 0, 'alpha', 0), optional=(2, 5, 0, 'alpha', 1), compiler_flag=16384) at remote 0x7efe18790ab8>, 'subprocess': <module at remote 0x7efe16fe8478>, 'BinaryIO': <GenericMeta(__module__='typing', __enter__=<function at remote 0x7efe16fcd578>, __args__=None, __origin__=None, __tree_hash__=2261999, __parameters__=(), _abc_cache=<WeakSet(_remove=<function at remote 0x7efe16fcd5f0>, _pending_removals=[], _iterating=set([]), data=set([])) at remote 0x7efe16fc6f50>, _abc_generic_negative_cache_version=15, __abstractmethods__=frozenset(['__exit__', 'truncate', 'read', 'readlines', 'writable', 'flush', 'seekable', 'isatty', 'readline', 'seek', 'close', 'fileno', 'writelines', 'name', 
'__enter__', 'readable', 'write', 'mode', 'closed', 'tell']), write=<function at remote 0x7efe16fcd488>, _gorg=<...>, __slo...(truncated), co=0x7efe16a967b0) at ../Python/ceval.c:669
#30 run_mod.lto_priv () at ../Python/pythonrun.c:1376
#31 0x00000000004e58f2 in PyRun_FileExFlags () at ../Python/pythonrun.c:1362
#32 0x00000000004e41a6 in PyRun_SimpleFileExFlags () at ../Python/pythonrun.c:948
#33 0x00000000004938ce in Py_Main () at ../Modules/main.c:640
#34 0x00007efe18113830 in __libc_start_main (main=0x493370 <main>, argc=7, argv=0x7ffceeaf1228, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7ffceeaf1218) at ../csu/libc-start.c:291
#35 0x0000000000493299 in _start ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.