Created
June 12, 2020 22:34
-
-
Save zhengyang92/446b5c802069ac8435b112719a32272b to your computer and use it in GitHub Desktop.
allgather_crash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Allgather ACCL ba46a1d Fri Jun 12 15:27:29 PDT 2020 | |
Allgather ACCL ba46a1d Fri Jun 12 15:27:29 PDT 2020 | |
Allgather ACCL ba46a1d Fri Jun 12 15:27:29 PDT 2020 | |
Allgather ACCL ba46a1d Fri Jun 12 15:27:29 PDT 2020 | |
# OSU MPI-CUDA Allgather Latency Test v5.6.2 | |
# Size Avg Latency(us) | |
1 59.12 | |
2 58.09 | |
4 56.24 | |
8 55.10 | |
16 55.11 | |
32 55.29 | |
64 54.81 | |
128 53.57 | |
256 53.22 | |
512 53.45 | |
1024 53.86 | |
2048 55.12 | |
4096 58.09 | |
[gcrsandbox102:38337:0:38337] cuda_ipc_cache.c:154 Fatal: dest:38339: failed to open ipc mem handle. addr:0x7f7803c00000 len:4194304 (Element already exists) | |
==== backtrace ==== | |
[gcrsandbox102:38339:0:38339] cuda_ipc_cache.c:154 Fatal: dest:38337: failed to open ipc mem handle. addr:0x7f281dc00000 len:4194304 (Element already exists) | |
==== backtrace ==== | |
[gcrsandbox102:38338:0:38338] cuda_ipc_cache.c:154 Fatal: dest:38337: failed to open ipc mem handle. addr:0x7f281dc00000 len:4194304 (Element already exists) | |
==== backtrace ==== | |
[gcrsandbox102:38340:0:38340] cuda_ipc_cache.c:154 Fatal: dest:38337: failed to open ipc mem handle. addr:0x7f281dc00000 len:4194304 (Element already exists) | |
==== backtrace ==== | |
0 /usr/local/lib/libucs.so.0(ucs_fatal_error_message+0xa1) [0x7f77e8833291] | |
1 /usr/local/lib/libucs.so.0(+0x1d385) [0x7f77e8833385] | |
2 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693) [0x7f77e39f0993] | |
3 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d) [0x7f77e39efebd] | |
4 /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2) [0x7f77e8a77742] | |
5 /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180) [0x7f77e8a78be0] | |
6 /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d) [0x7f77e83ea70d] | |
7 /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a) [0x7f77e8a692ba] | |
8 /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17) [0x7f77e9b992e7] | |
0 /usr/local/lib/libucs.so.0(ucs_fatal_error_message+0xa1) [0x7f2807164291] | |
1 /usr/local/lib/libucs.so.0(+0x1d385) [0x7f2807164385] | |
2 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693) [0x7f28064f3993] | |
3 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d) [0x7f28064f2ebd] | |
4 /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2) [0x7f28073a8742] | |
5 /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180) [0x7f28073a9be0] | |
6 /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d) [0x7f2806d1b70d] | |
7 /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a) [0x7f280739a2ba] | |
8 /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17) [0x7f280c59e2e7] | |
9 /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c) [0x7f2846c41cec] | |
10 /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d) [0x7f2848f53f2d] | |
11 /usr/local/lib/libmpi.so.40(PMPI_Wait+0x52) [0x7f2848f9a542] | |
12 /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x3bd) [0x7f28494b671d] | |
0 /usr/local/lib/libucs.so.0(ucs_fatal_error_message+0xa1) [0x7f5ff1238291] | |
1 /usr/local/lib/libucs.so.0(+0x1d385) [0x7f5ff1238385] | |
2 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693) [0x7f5ff05c7993] | |
3 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d) [0x7f5ff05c6ebd] | |
4 /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2) [0x7f5ff147c742] | |
5 /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180) [0x7f5ff147dbe0] | |
6 /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d) [0x7f5ff0def70d] | |
7 /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a) [0x7f5ff146e2ba] | |
8 /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17) [0x7f5ff259e2e7] | |
9 /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c) [0x7f6030c2bcec] | |
10 /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d) [0x7f6032f3df2d] | |
11 /usr/local/lib/libmpi.so.40(PMPI_Wait+0x52) [0x7f6032f84542] | |
12 /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x20f) [0x7f60334a056f] | |
13 ./osu_allgather_accl(+0x250b) [0x55d10fafa50b] | |
14 /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f2848b35b97] | |
15 ./osu_allgather_accl(+0x299a) [0x55d10fafa99a] | |
=================== | |
[gcrsandbox102:38337] *** Process received signal *** | |
9 /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c) [0x7f782838dcec] | |
10 /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d) [0x7f782a69ff2d] | |
11 /usr/local/lib/libmpi.so.40(PMPI_Wait+0x52) [0x7f782a6e6542] | |
12 /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x6a9) [0x7f782ac02a09] | |
13 ./osu_allgather_accl(+0x250b) [0x55d094cd450b] | |
14 /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f782a281b97] | |
15 ./osu_allgather_accl(+0x299a) [0x55d094cd499a] | |
=================== | |
[gcrsandbox102:38339] *** Process received signal *** | |
0 /usr/local/lib/libucs.so.0(ucs_fatal_error_message+0xa1) [0x7fd73c432291] | |
1 /usr/local/lib/libucs.so.0(+0x1d385) [0x7fd73c432385] | |
2 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693) [0x7fd7375c6993] | |
3 /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d) [0x7fd7375c5ebd] | |
4 /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2) [0x7fd73c676742] | |
5 /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180) [0x7fd73c677be0] | |
6 /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d) [0x7fd737dee70d] | |
7 /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a) [0x7fd73c6682ba] | |
8 /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17) [0x7fd73d7982e7] | |
9 /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c) [0x7fd77be87cec] | |
10 /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d) [0x7fd77e199f2d] | |
11 /usr/local/lib/libmpi.so.40(PMPI_Wait+0x52) [0x7fd77e1e0542] | |
12 /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x534) [0x7fd77e6fc894] | |
13 ./osu_allgather_accl(+0x250b) [0x56406932250b] | |
14 /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7fd77dd7bb97] | |
15 ./osu_allgather_accl(+0x299a) [0x56406932299a] | |
=================== | |
[gcrsandbox102:38338] *** Process received signal *** | |
13 ./osu_allgather_accl(+0x250b) [0x55d1888eb50b] | |
14 /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f6032b1fb97] | |
15 ./osu_allgather_accl(+0x299a) [0x55d1888eb99a] | |
=================== | |
[gcrsandbox102:38340] *** Process received signal *** | |
[gcrsandbox102:38340] Signal: Aborted (6) | |
[gcrsandbox102:38340] Signal code: (-6) | |
[gcrsandbox102:38338] Signal: Aborted (6) | |
[gcrsandbox102:38338] Signal code: (-6) | |
[gcrsandbox102:38337] Signal: Aborted (6) | |
[gcrsandbox102:38337] Signal code: (-6) | |
[gcrsandbox102:38337] [ 0] [gcrsandbox102:38339] Signal: Aborted (6) | |
[gcrsandbox102:38339] Signal code: (-6) | |
[gcrsandbox102:38339] [ 0] [gcrsandbox102:38340] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x3ef20)[0x7f6032b3cf20] | |
[gcrsandbox102:38340] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7)[0x7f6032b3ce97] | |
[gcrsandbox102:38340] [ 2] [gcrsandbox102:38338] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x3ef20)[0x7fd77dd98f20] | |
[gcrsandbox102:38338] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7)[0x7fd77dd98e97] | |
[gcrsandbox102:38338] [ 2] /lib/x86_64-linux-gnu/libc.so.6(+0x3ef20)[0x7f2848b52f20] | |
[gcrsandbox102:38337] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7)[0x7f2848b52e97] | |
[gcrsandbox102:38337] [ 2] /lib/x86_64-linux-gnu/libc.so.6(abort+0x141)[0x7f2848b54801] | |
[gcrsandbox102:38337] [ 3] /usr/local/lib/libucs.so.0(+0x1d296)[0x7f2807164296] | |
[gcrsandbox102:38337] [ 4] /usr/local/lib/libucs.so.0(+0x1d385)[0x7f2807164385] | |
[gcrsandbox102:38337] /lib/x86_64-linux-gnu/libc.so.6(+0x3ef20)[0x7f782a29ef20] | |
[gcrsandbox102:38339] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7)[0x7f782a29ee97] | |
[gcrsandbox102:38339] [ 2] /lib/x86_64-linux-gnu/libc.so.6(abort+0x141)[0x7f782a2a0801] | |
[gcrsandbox102:38339] [ 3] /usr/local/lib/libucs.so.0(+0x1d296)[0x7f77e8833296] | |
[gcrsandbox102:38339] [ 4] /usr/local/lib/libucs.so.0(+0x1d385)[0x7f77e8833385] | |
[gcrsandbox102:38339] [ 5] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693)[0x7f77e39f0993] | |
[gcrsandbox102:38339] [ 6] [ 5] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693)[0x7f28064f3993] | |
[gcrsandbox102:38337] [ 6] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d)[0x7f28064f2ebd] | |
[gcrsandbox102:38337] [ 7] /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2)[0x7f28073a8742] | |
[gcrsandbox102:38337] [ 8] /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180)[0x7f28073a9be0] | |
[gcrsandbox102:38337] [ 9] /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d)[0x7f2806d1b70d] | |
[gcrsandbox102:38337] [10] /lib/x86_64-linux-gnu/libc.so.6(abort+0x141)[0x7f6032b3e801] | |
[gcrsandbox102:38340] [ 3] /usr/local/lib/libucs.so.0(+0x1d296)[0x7f5ff1238296] | |
[gcrsandbox102:38340] [ 4] /usr/local/lib/libucs.so.0(+0x1d385)[0x7f5ff1238385] | |
[gcrsandbox102:38340] [ 5] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693)[0x7f5ff05c7993] | |
[gcrsandbox102:38340] [ 6] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d)[0x7f5ff05c6ebd] | |
[gcrsandbox102:38340] [ 7] /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2)[0x7f5ff147c742] | |
[gcrsandbox102:38340] [ 8] /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180)[0x7f5ff147dbe0] | |
[gcrsandbox102:38340] [ 9] /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d)[0x7f5ff0def70d] | |
[gcrsandbox102:38340] [10] /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a)[0x7f5ff146e2ba] | |
[gcrsandbox102:38340] [11] /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17)[0x7f5ff259e2e7] | |
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141)[0x7fd77dd9a801] | |
[gcrsandbox102:38338] [ 3] /usr/local/lib/libucs.so.0(+0x1d296)[0x7fd73c432296] | |
[gcrsandbox102:38338] [ 4] /usr/local/lib/libucs.so.0(+0x1d385)[0x7fd73c432385] | |
[gcrsandbox102:38338] [ 5] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_cache_map_memhandle+0x693)[0x7fd7375c6993] | |
[gcrsandbox102:38338] [ 6] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d)[0x7fd7375c5ebd] | |
[gcrsandbox102:38338] [ 7] /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2)[0x7fd73c676742] | |
[gcrsandbox102:38338] [ 8] /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180)[0x7fd73c677be0] | |
[gcrsandbox102:38338] [ 9] /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d)[0x7fd737dee70d] | |
[gcrsandbox102:38338] [10] /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a)[0x7fd73c6682ba] | |
[gcrsandbox102:38338] [11] /usr/local/lib/ucx/libuct_cuda.so.0(uct_cuda_ipc_ep_put_zcopy+0x9d)[0x7f77e39efebd] | |
[gcrsandbox102:38339] [ 7] /usr/local/lib/libucp.so.0(ucp_rndv_progress_rma_put_zcopy+0x1c2)[0x7f77e8a77742] | |
[gcrsandbox102:38339] [ 8] /usr/local/lib/libucp.so.0(ucp_rndv_rtr_handler+0x180)[0x7f77e8a78be0] | |
[gcrsandbox102:38339] [ 9] /usr/local/lib/libuct.so.0(uct_mm_iface_progress+0x14d)[0x7f77e83ea70d] | |
[gcrsandbox102:38339] [10] /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a)[0x7f77e8a692ba] | |
[gcrsandbox102:38339] [11] /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17)[0x7f77e9b992e7] | |
[gcrsandbox102:38339] [12] /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17)[0x7fd73d7982e7] | |
[gcrsandbox102:38338] [12] /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c)[0x7fd77be87cec] | |
[gcrsandbox102:38338] [13] /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d)[0x7fd77e199f2d] | |
[gcrsandbox102:38338] [14] /usr/local/lib/libucp.so.0(ucp_worker_progress+0x5a)[0x7f280739a2ba] | |
[gcrsandbox102:38337] [11] /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_progress+0x17)[0x7f280c59e2e7] | |
[gcrsandbox102:38337] [12] /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c)[0x7f2846c41cec] | |
[gcrsandbox102:38337] [13] /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d)[0x7f2848f53f2d] | |
[gcrsandbox102:38337] [14] [gcrsandbox102:38340] [12] /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c)[0x7f6030c2bcec] | |
[gcrsandbox102:38340] [13] /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d)[0x7f6032f3df2d] | |
[gcrsandbox102:38340] [14] /usr/local/lib/libopen-pal.so.40(opal_progress+0x2c)[0x7f782838dcec] | |
[gcrsandbox102:38339] [13] /usr/local/lib/libmpi.so.40(ompi_request_default_wait+0x4d)[0x7f782a69ff2d] | |
[gcrsandbox102:38339] [14] /usr/local/lib/libmpi.so.40(PMPI_Wait+0x52)[0x7f6032f84542] | |
[gcrsandbox102:38340] [15] /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x20f)[0x7f60334a056f] | |
[gcrsandbox102:38340] [16] ./osu_allgather_accl(+0x250b)[0x55d1888eb50b] | |
[gcrsandbox102:38340] [17] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7)[0x7f6032b1fb97] | |
[gcrsandbox102:38340] /usr/local/lib/libmpi.so.40(PMPI_Wait+0x52)[0x7fd77e1e0542] | |
[gcrsandbox102:38338] [15] /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x534)[0x7fd77e6fc894] | |
[gcrsandbox102:38338] [16] ./osu_allgather_accl(+0x250b)[0x56406932250b] | |
[gcrsandbox102:38338] [17] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7)[0x7fd77dd7bb97] | |
/usr/local/lib/libmpi.so.40(PMPI_Wait+0x52)[0x7f2848f9a542] | |
[gcrsandbox102:38337] [15] /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x3bd)[0x7f28494b671d] | |
[gcrsandbox102:38337] [16] ./osu_allgather_accl(+0x250b)[0x55d10fafa50b] | |
[gcrsandbox102:38337] [17] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7)[0x7f2848b35b97] | |
[gcrsandbox102:38337] [18] ./osu_allgather_accl(+0x299a)[0x55d10fafa99a] | |
[gcrsandbox102:38337] *** End of error message *** | |
/usr/local/lib/libmpi.so.40(PMPI_Wait+0x52)[0x7f782a6e6542] | |
[gcrsandbox102:38339] [15] /home/t-liuzhe/work/collcc/out/lib/libaccl.so(ACCL_Allgather+0x6a9)[0x7f782ac02a09] | |
[gcrsandbox102:38339] [16] ./osu_allgather_accl(+0x250b)[0x55d094cd450b] | |
[gcrsandbox102:38339] [17] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7)[0x7f782a281b97] | |
[gcrsandbox102:38339] [18] ./osu_allgather_accl(+0x299a)[0x55d094cd499a] | |
[gcrsandbox102:38339] *** End of error message *** | |
[18] ./osu_allgather_accl(+0x299a)[0x55d1888eb99a] | |
[gcrsandbox102:38340] *** End of error message *** | |
[gcrsandbox102:38338] [18] ./osu_allgather_accl(+0x299a)[0x56406932299a] | |
[gcrsandbox102:38338] *** End of error message *** | |
-------------------------------------------------------------------------- | |
Primary job terminated normally, but 1 process returned | |
a non-zero exit code. Per user-direction, the job has been aborted. | |
-------------------------------------------------------------------------- | |
-------------------------------------------------------------------------- | |
mpirun noticed that process rank 2 with PID 0 on node gcrsandbox102 exited on signal 6 (Aborted). | |
-------------------------------------------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment