# -----------------------------------------------------------------------------
# Dump of two unified diffs, each introduced by a "# cat <file>" marker line.
#
# rccl-namespace.patch (CMakeLists.txt, test/CMakeLists.txt):
#   * Removes the hipcc target-ID probe, the ROCM_PATH derivation from the
#     compiler path, the CMAKE_INSTALL_PREFIX cache default, and the
#     DEFAULT_AMDGPU_TARGETS / AMDGPU_TARGETS cache default.
#   * Points CHECK_INCLUDE_FILE_CXX at /usr/include/rocm_smi/rocm_smi64Config.h
#     instead of a ${ROCM_PATH}-relative header path.
#   * Replaces the HIP_COMPILER "clang" and "hcc" conditional blocks with
#     unconditional -fvisibility=hidden / -fgpu-rdc compile options and a
#     -fgpu-rdc link flag; the -parallel-jobs probe, the hsa include directory
#     and the BUILD_STATIC archiver setup are dropped along with those blocks.
#   * Drops the ${ROCM_PATH}/rocm_smi include directory and -L library path,
#     keeping only -lrocm_smi64 (also in the BUILD_STATIC branch of
#     test/CMakeLists.txt).
#   * Changes the rocm_export_targets TARGETS entry from rccl to roc::rccl.
#
# rccl-nccl.patch (src/enqueue.cc, src/misc/{gdrwrap,ibvwrap,nvmlwrap}.cc):
#   * Replaces each pthread_yield() call in the spin-wait loops with
#     sched_yield().
#
# NOTE(review): several diff lines share one physical line here, so line breaks
#   (and probably indentation / blank context lines) appear to have been lost;
#   as stored this likely cannot be fed to patch(1) directly - confirm against
#   the original patch files before use.
# NOTE(review): the removed set(CMAKE_CXX_ARCHIVE_CREATE " -o ${LINK_PROPS} ")
#   string looks truncated (leading/trailing blanks, no archiver or object
#   placeholders) - verify against upstream, since removed lines must match.
# NOTE(review): AMDGPU_TARGETS is still iterated by the added foreach() but no
#   longer receives a default in this file, and the ROCM_PATH fallback is gone;
#   presumably both are supplied by the packager's cmake invocation - confirm.
# NOTE(review): -fgpu-rdc is kept while the BUILD_STATIC archiver handling that
#   the removed comment says it requires is deleted; test/CMakeLists.txt still
#   has a BUILD_STATIC branch - confirm static builds are out of scope.
# NOTE(review): no #include is added for sched_yield(); presumably <sched.h> is
#   already reachable in the four C++ files - verify.
# -----------------------------------------------------------------------------
# cat rccl-namespace.patch --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,39 +11,6 @@ include(cmake/Dependencies.cmake) -# Detect compiler support for target ID -# This section is deprecated. Please use rocm_check_target_ids for future use. -if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help" - OUTPUT_VARIABLE CXX_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE) - string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT}) -endif() - -if(NOT DEFINED ROCM_PATH) - get_filename_component(_real_path ${CMAKE_CXX_COMPILER} REALPATH) - get_filename_component(_new_path "${_real_path}" DIRECTORY) - get_filename_component(ROCM_PATH "${_new_path}/../.." REALPATH) -endif() - -set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "") - -#Set the AMDGPU_TARGETS with backward compatiblity -if(COMMAND rocm_check_target_ids) - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030" - ) -else() - # Use target ID syntax if supported for AMDGPU_TARGETS - if(TARGET_ID_SUPPORT) - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030") - else() - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908") - endif() -endif() -set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") - option(BUILD_TESTS "Build test programs" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) @@ -222,7 +189,7 @@ add_definitions(-DENABLE_COLLTRACE) endif() -CHECK_INCLUDE_FILE_CXX("${ROCM_PATH}/rocm_smi/include/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG) +CHECK_INCLUDE_FILE_CXX("/usr/include/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG) IF(HAVE_ROCM_SMI64CONFIG) add_definitions(-DUSE_ROCM_SMI64CONFIG) ENDIF() @@ -231,42 +198,15 @@ 
target_link_libraries(rccl PRIVATE --amdgpu-target=${target}) endforeach() -if("${HIP_COMPILER}" MATCHES "clang") - target_compile_options(rccl PRIVATE -fvisibility=hidden) - foreach(target ${AMDGPU_TARGETS}) - target_compile_options(rccl PRIVATE -fgpu-rdc) - endforeach() - target_link_libraries(rccl PRIVATE -fgpu-rdc) - target_include_directories(rccl PRIVATE ${ROCM_PATH}/hsa/include) - find_program( hipcc_executable hipcc ) - execute_process(COMMAND bash "-c" "${hipcc_executable} -help | grep 'parallel-jobs'" OUTPUT_VARIABLE hipcc_parallel_jobs) - if("${hipcc_parallel_jobs}" MATCHES "parallel-jobs") - target_compile_options(rccl PRIVATE -parallel-jobs=8 PRIVATE -Wno-format-nonliteral) - target_link_libraries(rccl PRIVATE -parallel-jobs=8) - endif() - - # RCCL static lib uses -fgpu-rdc which requires hipcc as the linker and archiver - if(BUILD_STATIC) - target_link_libraries(rccl PRIVATE --emit-static-lib) - set(CMAKE_AR "${hipcc_executable}") - get_property(link_libraries TARGET rccl PROPERTY LINK_LIBRARIES) - string (REPLACE ";" " " LINK_PROPS "${link_libraries}") - set(CMAKE_CXX_ARCHIVE_CREATE " -o ${LINK_PROPS} ") - endif() -endif() - -if("${HIP_COMPILER}" MATCHES "hcc") - find_program( hcc_executable hcc ) - execute_process(COMMAND bash "-c" "${hcc_executable} --version | sed -e '1!d' -e 's/.*based on HCC\\s*//'" OUTPUT_VARIABLE hcc_version_string) - execute_process(COMMAND bash "-c" "echo \"${hcc_version_string}\" | awk -F\".\" '{ printf $1}'" OUTPUT_VARIABLE hcc_major_version) - execute_process(COMMAND bash "-c" "echo \"${hcc_version_string}\" | awk -F\".\" '{ printf $2}'" OUTPUT_VARIABLE hcc_minor_version) - if ("${hcc_major_version}.${hcc_minor_version}" VERSION_LESS "4.0") - target_link_libraries(rccl PRIVATE -hc-function-calls) - endif() -endif() +target_compile_options(rccl PRIVATE -fvisibility=hidden) + +foreach(target ${AMDGPU_TARGETS}) + target_compile_options(rccl PRIVATE -fgpu-rdc) +endforeach() + +target_link_libraries(rccl PRIVATE -fgpu-rdc) 
-target_include_directories(rccl PRIVATE ${ROCM_PATH}/rocm_smi/include) -target_link_libraries(rccl PRIVATE hip::device dl -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib) +target_link_libraries(rccl PRIVATE hip::device dl -lrocm_smi64) target_link_libraries(rccl INTERFACE hip::host) #Setup librccl.so version @@ -281,7 +221,7 @@ rocm_export_targets(NAMESPACE roc:: TARGETS - rccl + roc::rccl DEPENDS hip) if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY) --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -101,7 +101,7 @@ # through -l and -L instead of command line input. if(BUILD_STATIC) add_dependencies(UnitTests rccl) - target_link_libraries(UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib) + target_link_libraries(UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64) else() target_link_libraries(UnitTests PRIVATE rccl) endif() # cat rccl-nccl.patch --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -221,7 +221,7 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - while (*ptr < comm->intraRanks) pthread_yield(); + while (*ptr < comm->intraRanks) sched_yield(); comm->intraPhase ^= 1; return ncclSuccess; } --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -57,7 +57,7 @@ if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) { // Another thread raced in front of us. Wait for it to be done. - while (gdrState == gdrInitializing) pthread_yield(); + while (gdrState == gdrInitializing) sched_yield(); return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError; } --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -48,7 +48,7 @@ if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) { // Another thread raced in front of us. Wait for it to be done. 
- while (ibvState == ibvInitializing) pthread_yield(); + while (ibvState == ibvInitializing) sched_yield(); return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError; } --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -34,7 +34,7 @@ if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) { // Another thread raced in front of us. Wait for it to be done. - while (nvmlState == nvmlInitializing) pthread_yield(); + while (nvmlState == nvmlInitializing) sched_yield(); return (nvmlState == nvmlInitialized) ? ncclSuccess : ncclSystemError; }