-
-
Save anonymous/e6e8822a01dde1bb20195b4002d8efc3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = '<stdin>' | |
source_filename = "cxx11_tensor_reduction_cuda-sm_35.cui" | |
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" | |
target triple = "nvptx64-nvidia-cuda" | |
%struct.__cuda_builtin_blockIdx_t = type { i8 } | |
%struct.__cuda_builtin_blockDim_t = type { i8 } | |
%struct.__cuda_builtin_threadIdx_t = type { i8 } | |
%struct.__cuda_builtin_gridDim_t = type { i8 } | |
%"struct.Eigen::internal::SumReducer" = type { i8 } | |
%"struct.Eigen::TensorEvaluator" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* } | |
%"class.Eigen::array" = type { [2 x i8] } | |
%"struct.Eigen::DSizes" = type { %"class.Eigen::array.0" } | |
%"class.Eigen::array.1" = type { [2 x i32] } | |
%"class.Eigen::array.2" = type { [1 x %"struct.Eigen::internal::TensorIntDivisor"] } | |
%"struct.Eigen::internal::TensorIntDivisor" = type { i32, i32, i32 } | |
%"class.Eigen::array.0" = type { [1 x i32] } | |
%"struct.Eigen::TensorEvaluator.3" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::DSizes.4" = type { %"class.Eigen::array.1" } | |
%"struct.Eigen::GpuDevice" = type { %"class.Eigen::StreamInterface"* } | |
%"class.Eigen::StreamInterface" = type { i32 (...)** } | |
%"struct.Eigen::internal::scalar_cast_op" = type { i8 } | |
%"struct.Eigen::TensorEvaluator.5" = type { %"struct.Eigen::TensorEvaluator", %"struct.Eigen::GpuDevice"*, float* } | |
%"struct.Eigen::internal::PtrWrapper" = type { float* } | |
%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" } | |
%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::Identity" } | |
%"struct.Eigen::internal::(anonymous namespace)::Identity" = type { i8 } | |
%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer" = type { float } | |
%"struct.Eigen::TensorEvaluator.6" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.8" } | |
%"struct.Eigen::TensorEvaluator.7" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.8" = type { %"struct.Eigen::TensorEvaluator", %"class.Eigen::TensorReductionOp", %"struct.Eigen::GpuDevice"*, float* } | |
%"class.Eigen::TensorReductionOp" = type <{ %"class.Eigen::TensorMap"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }> | |
%"class.Eigen::TensorMap" = type { float*, %"struct.Eigen::DSizes.4" } | |
%"struct.Eigen::TensorEvaluator.11" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator" } | |
%"struct.Eigen::TensorEvaluator.12" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.13" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.14" = type { %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::GpuDevice"*, float* } | |
%"struct.Eigen::TensorEvaluator.15" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.17" } | |
%"struct.Eigen::TensorEvaluator.16" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.17" = type { %"struct.Eigen::TensorEvaluator.12", %"class.Eigen::TensorReductionOp.18", %"struct.Eigen::GpuDevice"*, float* } | |
%"class.Eigen::TensorReductionOp.18" = type <{ %"class.Eigen::TensorMap.20"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }> | |
%"class.Eigen::TensorMap.20" = type { float*, %"struct.Eigen::DSizes.4" } | |
%"struct.Eigen::TensorEvaluator.24" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.12" } | |
$_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_ = comdat any | |
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any | |
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any | |
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any | |
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any | |
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZNK5Eigen8internal10SumReducerIfE10initializeEv = comdat any | |
$_ZN5Eigen6numext4miniIiEET_RKS2_S4_ = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi = comdat any | |
$_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf = comdat any | |
$_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE = comdat any | |
$_ZN5Eigen8internal14scalar_cast_opIifEC1Ev = comdat any | |
$_ZNK5Eigen8internal14scalar_cast_opIifEclERKi = comdat any | |
$_ZN5Eigen8internal14scalar_cast_opIifEC2Ev = comdat any | |
$_ZN5Eigen8internal4castIifEET0_RKT_ = comdat any | |
$_ZN5Eigen8internal9cast_implIifE3runERKi = comdat any | |
$_Z5__ldgPKf = comdat any | |
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen5divupIiijEET_T0_T1_ = comdat any | |
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen5divupIiEET_S1_S1_ = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi = comdat any | |
$_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi = comdat any | |
$_ZNK5Eigen8internal10SumReducerIfE8finalizeEf = comdat any | |
$_ZNK5Eigen5arrayIiLm1EEixEm = comdat any | |
$_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any | |
$_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm = comdat any | |
$_ZNK5Eigen8internal16TensorIntDivisorIiLb0EE6divideEi = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED2Ev = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_ = comdat any | |
$_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any | |
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv = comdat any | |
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv = comdat any | |
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi = comdat any | |
$_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi = comdat any | |
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi = comdat any | |
$_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi = comdat any | |
$_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED2Ev = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any | |
$_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi = comdat any | |
$_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii = comdat any | |
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
@blockIdx = extern_weak addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1 | |
@blockDim = extern_weak addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1 | |
@threadIdx = extern_weak addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1 | |
@gridDim = extern_weak addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1 | |
@.str = private unnamed_addr constant [24 x i8] c"blockDim.x == BLOCK_DIM\00", align 1 | |
@.str.1 = private unnamed_addr constant [76 x i8] c"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@.str.2 = private unnamed_addr constant [16 x i8] c"blockDim.y == 1\00", align 1 | |
@.str.3 = private unnamed_addr constant [16 x i8] c"blockDim.z == 1\00", align 1 | |
@.str.4 = private unnamed_addr constant [22 x i8] c"gridDim.x == GRID_DIM\00", align 1 | |
@.str.5 = private unnamed_addr constant [15 x i8] c"gridDim.y == 1\00", align 1 | |
@.str.6 = private unnamed_addr constant [15 x i8] c"gridDim.z == 1\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
; __PRETTY_FUNCTION__ string constants emitted for device-side assert()/abort
; diagnostics inside the anonymous-namespace Row/ColumnReduceKernel template
; instantiations, followed by the "__CUDA_FTZ" name used to query the
; flush-denormals-to-zero module flag. All are private constant data; none are
; referenced as code.
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00" | |
; Function Attrs: convergent nounwind | |
; Eigen::internal::ReductionInitKernel<float, int>(float %0, i32 %1, float* %2)
; Grid-stride initialization loop: stores the scalar %0 into %2[i] for every
; i in [blockIdx.x*blockDim.x + threadIdx.x, %1), stepping by the total thread
; count blockDim.x * gridDim.x. Unoptimized (-O0 style) IR: all parameters are
; spilled to allocas and reloaded on each use.
define weak_odr void @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_(float, i32, float*) #0 comdat { | |
%4 = alloca float, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca float*, align 8 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
store float %0, float* %4, align 4 | |
store i32 %1, i32* %5, align 4 | |
store float* %2, float** %6, align 8 | |
%10 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %10) #9 | |
; %7 = global thread index: blockIdx.x * blockDim.x + threadIdx.x
%11 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%12 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%13 = mul i32 %11, %12 | |
%14 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%15 = add i32 %13, %14 | |
store i32 %15, i32* %7, align 4 | |
%16 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %16) #9 | |
; %8 = grid-stride step: blockDim.x * gridDim.x
%17 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%18 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%19 = mul i32 %17, %18 | |
store i32 %19, i32* %8, align 4 | |
%20 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %20) #9 | |
; %9 = loop induction variable, starting at the global thread index
%21 = load i32, i32* %7, align 4 | |
store i32 %21, i32* %9, align 4 | |
br label %22 | |
; <label>:22: ; preds = %34, %3 | |
; loop header: continue while i (%9) < element count (%5, signed compare)
%23 = load i32, i32* %9, align 4 | |
%24 = load i32, i32* %5, align 4 | |
%25 = icmp slt i32 %23, %24 | |
br i1 %25, label %28, label %26 | |
; <label>:26: ; preds = %22 | |
%27 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %27) #9 | |
br label %38 | |
; <label>:28: ; preds = %22 | |
; loop body: output[i] = init value
%29 = load float, float* %4, align 4 | |
%30 = load i32, i32* %9, align 4 | |
%31 = sext i32 %30 to i64 | |
%32 = load float*, float** %6, align 8 | |
%33 = getelementptr inbounds float, float* %32, i64 %31 | |
store float %29, float* %33, align 4 | |
br label %34 | |
; <label>:34: ; preds = %28 | |
; loop latch: i += stride
%35 = load i32, i32* %8, align 4 | |
%36 = load i32, i32* %9, align 4 | |
%37 = add nsw i32 %36, %35 | |
store i32 %37, i32* %9, align 4 | |
br label %22 | |
; <label>:38: ; preds = %26 | |
%39 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %39) #9 | |
%40 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %40) #9 | |
ret void | |
} | |
; Function Attrs: argmemonly nounwind | |
declare void @llvm.lifetime.start(i64, i8* nocapture) #1 | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; __cuda_builtin_blockIdx_t::__fetch_builtin_x(): returns blockIdx.x via the
; legacy llvm.ptx.read.ctaid.x intrinsic (pre-NVVM naming; reads %ctaid.x).
define linkonce_odr i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.ctaid.x() | |
ret i32 %1 | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; __cuda_builtin_blockDim_t::__fetch_builtin_x(): returns blockDim.x via the
; legacy llvm.ptx.read.ntid.x intrinsic (reads %ntid.x).
define linkonce_odr i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.ntid.x() | |
ret i32 %1 | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; __cuda_builtin_threadIdx_t::__fetch_builtin_x(): returns threadIdx.x via the
; legacy llvm.ptx.read.tid.x intrinsic (reads %tid.x).
define linkonce_odr i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.tid.x() | |
ret i32 %1 | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; __cuda_builtin_gridDim_t::__fetch_builtin_x(): returns gridDim.x via the
; legacy llvm.ptx.read.nctaid.x intrinsic (reads %nctaid.x).
define linkonce_odr i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.nctaid.x() | |
ret i32 %1 | |
} | |
; Function Attrs: argmemonly nounwind | |
declare void @llvm.lifetime.end(i64, i8* nocapture) #1 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ctaid.x() #3 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ntid.x() #3 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.tid.x() #3 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.nctaid.x() #3 | |
; Function Attrs: convergent nounwind | |
; Eigen::internal::FullReductionKernel<256, 128, TensorEvaluator<SumReducer<float> reduction>, int>
;   (%0 = SumReducer (byval, stateless), %1 = input TensorEvaluator (byval),
;    %2 = num_coeffs, %3 = float* output)
; Full (scalar) sum reduction of the whole tensor into *output:
;   1. first_index (%7) = blockIdx.x * 256 * 128 + threadIdx.x
;      (BLOCK_DIM = 256, NUM_PER_THREAD = 128 baked in as constants).
;   2. If gridDim.x == 1, thread with first_index == 0 seeds *output with
;      SumReducer::initialize() (0.0f), then the block barriers. NOTE(review):
;      when gridDim.x > 1, *output is presumably pre-seeded by
;      ReductionInitKernel above — not visible from this function alone.
;   3. Each thread accumulates min(num_coeffs - first_index, 32768) coefficients
;      (32768 = 256 * 128) in steps of 256 into a private accumulator (%8).
;   4. Warp-level tree reduction via __shfl_down with offsets 16,8,4,2,1.
;   5. Lane 0 of each warp (threadIdx.x & 31 == 0) folds its partial into
;      *output with atomicReduce (atomicAdd for SumReducer).
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, float*) #0 comdat { | |
%5 = alloca i32, align 4 | |
%6 = alloca float*, align 8 | |
%7 = alloca i32, align 4 | |
%8 = alloca float, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca float, align 4 | |
%15 = alloca i32, align 4 | |
store i32 %2, i32* %5, align 4 | |
store float* %3, float** %6, align 8 | |
%16 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %16) #9 | |
; first_index = blockIdx.x * 256 * 128 + threadIdx.x
%17 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%18 = mul i32 %17, 256 | |
%19 = mul i32 %18, 128 | |
%20 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%21 = add i32 %19, %20 | |
store i32 %21, i32* %7, align 4 | |
; single-block launch: seed *output before accumulating
%22 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %24, label %31 | |
; <label>:24: ; preds = %4 | |
%25 = load i32, i32* %7, align 4 | |
%26 = icmp eq i32 %25, 0 | |
br i1 %26, label %27, label %30 | |
; <label>:27: ; preds = %24 | |
; thread 0 writes the reducer's identity (0.0f for sum) to *output
%28 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
%29 = load float*, float** %6, align 8 | |
store float %28, float* %29, align 4 | |
br label %30 | |
; <label>:30: ; preds = %27, %24 | |
; barrier is outside the thread-0 branch, so the whole block reaches it
call void @llvm.cuda.syncthreads() | |
br label %31 | |
; <label>:31: ; preds = %30, %4 | |
; %8 = per-thread accumulator, starts at the reducer identity
%32 = bitcast float* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %32) #9 | |
%33 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
store float %33, float* %8, align 4 | |
%34 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %34) #9 | |
; %9 = max_iter = min(num_coeffs - first_index, 256 * 128)
%35 = load i32, i32* %5, align 4 | |
%36 = load i32, i32* %7, align 4 | |
%37 = sub nsw i32 %35, %36 | |
store i32 %37, i32* %10, align 4 | |
store i32 32768, i32* %11, align 4 | |
%38 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %10, i32* dereferenceable(4) %11) #10 | |
store i32 %38, i32* %9, align 4 | |
%39 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %39) #9 | |
store i32 0, i32* %12, align 4 | |
br label %40 | |
; <label>:40: ; preds = %58, %31 | |
; accumulation loop: for (i = 0; i < max_iter; i += 256)
%41 = load i32, i32* %12, align 4 | |
%42 = load i32, i32* %9, align 4 | |
%43 = icmp slt i32 %41, %42 | |
br i1 %43, label %46, label %44 | |
; <label>:44: ; preds = %40 | |
%45 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %45) #9 | |
br label %61 | |
; <label>:46: ; preds = %40 | |
%47 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %47) #9 | |
; index = first_index + i
%48 = load i32, i32* %7, align 4 | |
%49 = load i32, i32* %12, align 4 | |
%50 = add nsw i32 %48, %49 | |
store i32 %50, i32* %13, align 4 | |
%51 = bitcast float* %14 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %51) #9 | |
; field 10 of the reduction TensorEvaluator is the inner (input) evaluator
%52 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10 | |
%53 = load i32, i32* %13, align 4 | |
%54 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %52, i32 %53) #10 | |
store float %54, float* %14, align 4 | |
%55 = load float, float* %14, align 4 | |
; accum += coeff(index)
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %55, float* %8) #10 | |
%56 = bitcast float* %14 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %56) #9 | |
%57 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %57) #9 | |
br label %58 | |
; <label>:58: ; preds = %46 | |
%59 = load i32, i32* %12, align 4 | |
%60 = add nsw i32 %59, 256 | |
store i32 %60, i32* %12, align 4 | |
br label %40, !llvm.loop !46 | |
; <label>:61: ; preds = %44 | |
; warp shuffle reduction: for (offset = 16; offset > 0; offset /= 2)
%62 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %62) #9 | |
store i32 16, i32* %15, align 4 | |
br label %63 | |
; <label>:63: ; preds = %72, %61 | |
%64 = load i32, i32* %15, align 4 | |
%65 = icmp sgt i32 %64, 0 | |
br i1 %65, label %68, label %66 | |
; <label>:66: ; preds = %63 | |
%67 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %67) #9 | |
br label %75 | |
; <label>:68: ; preds = %63 | |
; accum += __shfl_down(accum, offset, warpSize=32); pre-Volta mask-less shuffle
%69 = load float, float* %8, align 4 | |
%70 = load i32, i32* %15, align 4 | |
%71 = call float @_ZL11__shfl_downfji(float %69, i32 %70, i32 32) #10 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %71, float* %8) #10 | |
br label %72 | |
; <label>:72: ; preds = %68 | |
%73 = load i32, i32* %15, align 4 | |
%74 = sdiv i32 %73, 2 | |
store i32 %74, i32* %15, align 4 | |
br label %63, !llvm.loop !48 | |
; <label>:75: ; preds = %66 | |
; lane 0 of each warp publishes its warp's partial sum atomically
%76 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%77 = and i32 %76, 31 | |
%78 = icmp eq i32 %77, 0 | |
br i1 %78, label %79, label %82 | |
; <label>:79: ; preds = %75 | |
%80 = load float*, float** %6, align 8 | |
%81 = load float, float* %8, align 4 | |
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %80, float %81, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10 | |
br label %82 | |
; <label>:82: ; preds = %79, %75 | |
%83 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %83) #9 | |
%84 = bitcast float* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %84) #9 | |
%85 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %85) #9 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::internal::SumReducer<float>::initialize() const
; Returns the sum identity: scalar_cast_op<int,float>()(0), i.e. 0.0f.
; The reducer itself (%0) is stateless; the cast-op temporary only exists to
; mirror the C++ source (cast<int,float>(0)).
define linkonce_odr float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"*) #4 comdat align 2 { | |
%2 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
%3 = alloca %"struct.Eigen::internal::scalar_cast_op", align 1 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::internal::SumReducer"* %0, %"struct.Eigen::internal::SumReducer"** %2, align 8 | |
%5 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %2, align 8 | |
%6 = bitcast %"struct.Eigen::internal::scalar_cast_op"* %3 to i8* | |
call void @llvm.lifetime.start(i64 1, i8* %6) #9 | |
call void @_ZN5Eigen8internal14scalar_cast_opIifEC1Ev(%"struct.Eigen::internal::scalar_cast_op"* %3) #10 | |
store i32 0, i32* %4, align 4 | |
%7 = call float @_ZNK5Eigen8internal14scalar_cast_opIifEclERKi(%"struct.Eigen::internal::scalar_cast_op"* %3, i32* dereferenceable(4) %4) #10 | |
%8 = bitcast %"struct.Eigen::internal::scalar_cast_op"* %3 to i8* | |
call void @llvm.lifetime.end(i64 1, i8* %8) #9 | |
ret float %7 | |
} | |
; Function Attrs: convergent nounwind | |
declare void @llvm.cuda.syncthreads() #5 | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; Eigen::numext::mini<int>(const int& a, const int& b)
; Returns the smaller of the two by-reference ints (signed compare b < a,
; i.e. the C++ idiom `b < a ? b : a` — returns a on ties).
define linkonce_odr i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4), i32* dereferenceable(4)) #2 comdat { | |
%3 = alloca i32*, align 8 | |
%4 = alloca i32*, align 8 | |
store i32* %0, i32** %3, align 8 | |
store i32* %1, i32** %4, align 8 | |
%5 = load i32*, i32** %4, align 8 | |
%6 = load i32, i32* %5, align 4 | |
%7 = load i32*, i32** %3, align 8 | |
%8 = load i32, i32* %7, align 4 | |
%9 = icmp slt i32 %6, %8 | |
br i1 %9, label %10, label %13 | |
; <label>:10: ; preds = %2 | |
%11 = load i32*, i32** %4, align 8 | |
%12 = load i32, i32* %11, align 4 | |
br label %16 | |
; <label>:13: ; preds = %2 | |
%14 = load i32*, i32** %3, align 8 | |
%15 = load i32, i32* %14, align 4 | |
br label %16 | |
; <label>:16: ; preds = %13, %10 | |
%17 = phi i32 [ %12, %10 ], [ %15, %13 ] | |
ret i32 %17 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::TensorEvaluator<const TensorMap<Tensor<float,2,1,int>>, GpuDevice>::coeff(int) const
; Returns element %1 of the evaluator's flat data buffer (field 0 of the
; struct, a float*), loaded through the read-only cache via loadConstant/__ldg.
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator.3"*, align 8 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator.3"* %0, %"struct.Eigen::TensorEvaluator.3"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%5 = load %"struct.Eigen::TensorEvaluator.3"*, %"struct.Eigen::TensorEvaluator.3"** %3, align 8 | |
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %5, i32 0, i32 0 | |
%7 = load float*, float** %6, align 8 | |
%8 = load i32, i32* %4, align 4 | |
%9 = sext i32 %8 to i64 | |
%10 = getelementptr inbounds float, float* %7, i64 %9 | |
%11 = call float @_ZN5Eigen12_GLOBAL__N_112loadConstantIfEET_PKS2_(float* %10) #10 | |
ret float %11 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::internal::SumReducer<float>::reduce(float t, float* accum) const
; In-place accumulate: *accum = *accum + t. The reducer object (%0 / %7) is
; loaded but stateless and otherwise unused.
define linkonce_odr void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"*, float, float*) #4 comdat align 2 { | |
%4 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
%5 = alloca float, align 4 | |
%6 = alloca float*, align 8 | |
store %"struct.Eigen::internal::SumReducer"* %0, %"struct.Eigen::internal::SumReducer"** %4, align 8 | |
store float %1, float* %5, align 4 | |
store float* %2, float** %6, align 8 | |
%7 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %4, align 8 | |
%8 = load float, float* %5, align 4 | |
%9 = load float*, float** %6, align 8 | |
%10 = load float, float* %9, align 4 | |
%11 = fadd float %10, %8 | |
store float %11, float* %9, align 4 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; __shfl_down(float var, int delta, int width) — pre-Volta, mask-less variant.
; Packs the control operand c = ((warpSize - width) << 8) | 0x1f and issues
; inline PTX "shfl.down.b32" to pull `var` from the lane `delta` positions
; higher within the warp segment. NOTE(review): this is the legacy sm_3x-era
; intrinsic lowering; on sm_70+ the *_sync form with an explicit mask is
; required.
define internal float @_ZL11__shfl_downfji(float, i32, i32) #4 { | |
%4 = alloca float, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca float, align 4 | |
%8 = alloca i32, align 4 | |
store float %0, float* %4, align 4 | |
store i32 %1, i32* %5, align 4 | |
store i32 %2, i32* %6, align 4 | |
%9 = bitcast float* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
%10 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %10) #9 | |
; c = ((32 - width) << 8) | 31: sub-warp segment descriptor for shfl.down
%11 = load i32, i32* %6, align 4 | |
%12 = sub nsw i32 32, %11 | |
%13 = shl i32 %12, 8 | |
%14 = or i32 %13, 31 | |
store i32 %14, i32* %8, align 4 | |
%15 = load float, float* %4, align 4 | |
%16 = load i32, i32* %5, align 4 | |
%17 = load i32, i32* %8, align 4 | |
%18 = call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %15, i32 %16, i32 %17) #5, !srcloc !50 | |
store float %18, float* %7, align 4 | |
%19 = load float, float* %7, align 4 | |
%20 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %20) #9 | |
%21 = bitcast float* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %21) #9 | |
ret float %19 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::internal::atomicReduce<float>(float* output, float accum, SumReducer<float>&)
; SumReducer specialization: folds `accum` into *output with atomicAdd; the
; reducer reference is stored but not otherwise used (sum needs no CAS loop).
define linkonce_odr void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float*, float, %"struct.Eigen::internal::SumReducer"* dereferenceable(1)) #4 comdat { | |
%4 = alloca float*, align 8 | |
%5 = alloca float, align 4 | |
%6 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
store float* %0, float** %4, align 8 | |
store float %1, float* %5, align 4 | |
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %6, align 8 | |
%7 = load float*, float** %4, align 8 | |
%8 = load float, float* %5, align 4 | |
; result of atomicAdd (the old value) is intentionally discarded
%9 = call float @_ZL9atomicAddPff(float* %7, float %8) #10 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; scalar_cast_op<int,float> complete-object constructor (C1): delegates to the
; base-object constructor (C2); the functor is empty so this is a no-op shell.
define linkonce_odr void @_ZN5Eigen8internal14scalar_cast_opIifEC1Ev(%"struct.Eigen::internal::scalar_cast_op"*) unnamed_addr #4 comdat align 2 { | |
%2 = alloca %"struct.Eigen::internal::scalar_cast_op"*, align 8 | |
store %"struct.Eigen::internal::scalar_cast_op"* %0, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8 | |
%3 = load %"struct.Eigen::internal::scalar_cast_op"*, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8 | |
call void @_ZN5Eigen8internal14scalar_cast_opIifEC2Ev(%"struct.Eigen::internal::scalar_cast_op"* %3) #10 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; scalar_cast_op<int,float>::operator()(const int& a) const
; Converts a to float via Eigen::internal::cast<int,float>; the functor
; instance (%0) is stateless.
define linkonce_odr float @_ZNK5Eigen8internal14scalar_cast_opIifEclERKi(%"struct.Eigen::internal::scalar_cast_op"*, i32* dereferenceable(4)) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::internal::scalar_cast_op"*, align 8 | |
%4 = alloca i32*, align 8 | |
store %"struct.Eigen::internal::scalar_cast_op"* %0, %"struct.Eigen::internal::scalar_cast_op"** %3, align 8 | |
store i32* %1, i32** %4, align 8 | |
%5 = load %"struct.Eigen::internal::scalar_cast_op"*, %"struct.Eigen::internal::scalar_cast_op"** %3, align 8 | |
%6 = load i32*, i32** %4, align 8 | |
%7 = call float @_ZN5Eigen8internal4castIifEET0_RKT_(i32* dereferenceable(4) %6) #10 | |
ret float %7 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; scalar_cast_op<int,float> base-object constructor (C2): empty class, nothing
; to initialize — body is a no-op.
define linkonce_odr void @_ZN5Eigen8internal14scalar_cast_opIifEC2Ev(%"struct.Eigen::internal::scalar_cast_op"*) unnamed_addr #4 comdat align 2 { | |
%2 = alloca %"struct.Eigen::internal::scalar_cast_op"*, align 8 | |
store %"struct.Eigen::internal::scalar_cast_op"* %0, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8 | |
%3 = load %"struct.Eigen::internal::scalar_cast_op"*, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::internal::cast<int, float>(const int&): thin forwarder to
; cast_impl<int,float>::run.
define linkonce_odr float @_ZN5Eigen8internal4castIifEET0_RKT_(i32* dereferenceable(4)) #4 comdat { | |
%2 = alloca i32*, align 8 | |
store i32* %0, i32** %2, align 8 | |
%3 = load i32*, i32** %2, align 8 | |
%4 = call float @_ZN5Eigen8internal9cast_implIifE3runERKi(i32* dereferenceable(4) %3) #10 | |
ret float %4 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::internal::cast_impl<int, float>::run(const int&): plain signed
; int -> float conversion (sitofp).
define linkonce_odr float @_ZN5Eigen8internal9cast_implIifE3runERKi(i32* dereferenceable(4)) #4 comdat align 2 { | |
%2 = alloca i32*, align 8 | |
store i32* %0, i32** %2, align 8 | |
%3 = load i32*, i32** %2, align 8 | |
%4 = load i32, i32* %3, align 4 | |
%5 = sitofp i32 %4 to float | |
ret float %5 | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; Eigen::(anonymous namespace)::loadConstant<float>(const float*): wraps __ldg
; so reads of evaluator data go through the read-only (texture/L1) cache path.
define internal float @_ZN5Eigen12_GLOBAL__N_112loadConstantIfEET_PKS2_(float*) #2 { | |
%2 = alloca float*, align 8 | |
store float* %0, float** %2, align 8 | |
%3 = load float*, float** %2, align 8 | |
%4 = call float @_Z5__ldgPKf(float* %3) #10 | |
ret float %4 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; __ldg(const float*): lowers to llvm.nvvm.ldg.global.f (read-only global
; load); the trailing i32 4 operand is the required alignment.
define linkonce_odr float @_Z5__ldgPKf(float*) #4 comdat { | |
%2 = alloca float*, align 8 | |
store float* %0, float** %2, align 8 | |
%3 = load float*, float** %2, align 8 | |
%4 = call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %3, i32 4) | |
ret float %4 | |
} | |
; Function Attrs: argmemonly nounwind readonly | |
declare float @llvm.nvvm.ldg.global.f.f32.p0f32(float* nocapture, i32) #6 | |
; Function Attrs: convergent inlinehint nounwind | |
; atomicAdd(float* address, float val): CUDA-header overload, forwarding to the
; __fAtomicAdd builtin; returns the value previously stored at *address.
define internal float @_ZL9atomicAddPff(float*, float) #4 { | |
%3 = alloca float*, align 8 | |
%4 = alloca float, align 4 | |
store float* %0, float** %3, align 8 | |
store float %1, float* %4, align 4 | |
%5 = load float*, float** %3, align 8 | |
%6 = load float, float* %4, align 4 | |
%7 = call float @_ZL12__fAtomicAddPff(float* %5, float %6) #10 | |
ret float %7 | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; __fAtomicAdd(float*, float): lowers to the llvm.nvvm.atomic.load.add.f32
; intrinsic (hardware float atomic add on generic-address-space pointer).
define internal float @_ZL12__fAtomicAddPff(float*, float) #2 { | |
%3 = alloca float*, align 8 | |
%4 = alloca float, align 4 | |
store float* %0, float** %3, align 8 | |
store float %1, float* %4, align 4 | |
%5 = load float*, float** %3, align 8 | |
%6 = load float, float* %4, align 4 | |
%7 = call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %5, float %6) | |
ret float %7 | |
} | |
; Function Attrs: argmemonly nounwind | |
declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* nocapture, float) #1 | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca float*, align 8 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32, align 4 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32 | |
%17 = alloca i32, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca i32, align 4 | |
%20 = alloca float, align 4 | |
%21 = alloca i32, align 4 | |
%22 = alloca i32, align 4 | |
%23 = alloca i32, align 4 | |
%24 = alloca float, align 4 | |
%25 = alloca i32, align 4 | |
%26 = alloca i32, align 4 | |
%27 = alloca i32, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
store float* %4, float** %8, align 8 | |
%28 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %28) #9 | |
store i32 16, i32* %9, align 4 | |
%29 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %29) #9 | |
%30 = load i32, i32* %6, align 4 | |
%31 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%32 = mul i32 %31, 128 | |
%33 = call i32 @_ZN5Eigen5divupIiijEET_T0_T1_(i32 %30, i32 %32) #10 | |
store i32 %33, i32* %10, align 4 | |
%34 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %34) #9 | |
%35 = load i32, i32* %10, align 4 | |
%36 = load i32, i32* %7, align 4 | |
%37 = mul nsw i32 %35, %36 | |
store i32 %37, i32* %11, align 4 | |
%38 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %38) #9 | |
%39 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%40 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%41 = mul i32 %39, %40 | |
store i32 %41, i32* %12, align 4 | |
%42 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %42) #9 | |
%43 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%44 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%45 = mul i32 %43, %44 | |
%46 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%47 = add i32 %45, %46 | |
store i32 %47, i32* %13, align 4 | |
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%49 = icmp eq i32 %48, 1 | |
br i1 %49, label %50, label %70 | |
; <label>:50: ; preds = %5 | |
%51 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %51) #9 | |
%52 = load i32, i32* %13, align 4 | |
store i32 %52, i32* %14, align 4 | |
br label %53 | |
; <label>:53: ; preds = %65, %50 | |
%54 = load i32, i32* %14, align 4 | |
%55 = load i32, i32* %7, align 4 | |
%56 = icmp slt i32 %54, %55 | |
br i1 %56, label %59, label %57 | |
; <label>:57: ; preds = %53 | |
%58 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %58) #9 | |
br label %69 | |
; <label>:59: ; preds = %53 | |
%60 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
%61 = load i32, i32* %14, align 4 | |
%62 = sext i32 %61 to i64 | |
%63 = load float*, float** %8, align 8 | |
%64 = getelementptr inbounds float, float* %63, i64 %62 | |
store float %60, float* %64, align 4 | |
br label %65 | |
; <label>:65: ; preds = %59 | |
%66 = load i32, i32* %12, align 4 | |
%67 = load i32, i32* %14, align 4 | |
%68 = add nsw i32 %67, %66 | |
store i32 %68, i32* %14, align 4 | |
br label %53 | |
; <label>:69: ; preds = %57 | |
br label %70 | |
; <label>:70: ; preds = %69, %5 | |
%71 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
%72 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %72, i32* %15, align 4 | |
br label %73 | |
; <label>:73: ; preds = %215, %70 | |
%74 = load i32, i32* %15, align 4 | |
%75 = load i32, i32* %11, align 4 | |
%76 = icmp slt i32 %74, %75 | |
br i1 %76, label %79, label %77 | |
; <label>:77: ; preds = %73 | |
store i32 5, i32* %16, align 4 | |
%78 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %78) #9 | |
br label %219 | |
; <label>:79: ; preds = %73 | |
%80 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %80) #9 | |
%81 = load i32, i32* %15, align 4 | |
%82 = load i32, i32* %10, align 4 | |
%83 = sdiv i32 %81, %82 | |
store i32 %83, i32* %17, align 4 | |
%84 = load i32, i32* %17, align 4 | |
%85 = load i32, i32* %7, align 4 | |
%86 = icmp slt i32 %84, %85 | |
br i1 %86, label %87, label %213 | |
; <label>:87: ; preds = %79 | |
%88 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %88) #9 | |
%89 = load i32, i32* %15, align 4 | |
%90 = load i32, i32* %10, align 4 | |
%91 = srem i32 %89, %90 | |
store i32 %91, i32* %18, align 4 | |
%92 = bitcast i32* %19 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %92) #9 | |
%93 = load i32, i32* %18, align 4 | |
%94 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%95 = mul i32 %93, %94 | |
%96 = mul i32 %95, 128 | |
%97 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%98 = add i32 %96, %97 | |
store i32 %98, i32* %19, align 4 | |
%99 = bitcast float* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %99) #9 | |
%100 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
store float %100, float* %20, align 4 | |
%101 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %101) #9 | |
store i32 0, i32* %21, align 4 | |
br label %102 | |
; <label>:102: ; preds = %180, %87 | |
%103 = load i32, i32* %21, align 4 | |
%104 = icmp slt i32 %103, 128 | |
br i1 %104, label %106, label %105 | |
; <label>:105: ; preds = %102 | |
store i32 8, i32* %16, align 4 | |
br label %183 | |
; <label>:106: ; preds = %102 | |
%107 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %107) #9 | |
%108 = load i32, i32* %19, align 4 | |
%109 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%110 = load i32, i32* %21, align 4 | |
%111 = add nsw i32 %110, 16 | |
%112 = sub nsw i32 %111, 1 | |
%113 = mul i32 %109, %112 | |
%114 = add i32 %108, %113 | |
store i32 %114, i32* %22, align 4 | |
%115 = load i32, i32* %22, align 4 | |
%116 = load i32, i32* %6, align 4 | |
%117 = icmp sge i32 %115, %116 | |
br i1 %117, label %118, label %147 | |
; <label>:118: ; preds = %106 | |
%119 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %119) #9 | |
%120 = load i32, i32* %19, align 4 | |
%121 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%122 = load i32, i32* %21, align 4 | |
%123 = mul i32 %121, %122 | |
%124 = add i32 %120, %123 | |
store i32 %124, i32* %23, align 4 | |
br label %125 | |
; <label>:125: ; preds = %142, %118 | |
%126 = load i32, i32* %23, align 4 | |
%127 = load i32, i32* %6, align 4 | |
%128 = icmp slt i32 %126, %127 | |
br i1 %128, label %131, label %129 | |
; <label>:129: ; preds = %125 | |
store i32 11, i32* %16, align 4 | |
%130 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %130) #9 | |
br label %146 | |
; <label>:131: ; preds = %125 | |
%132 = bitcast float* %24 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %132) #9 | |
%133 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10 | |
%134 = load i32, i32* %17, align 4 | |
%135 = load i32, i32* %6, align 4 | |
%136 = mul nsw i32 %134, %135 | |
%137 = load i32, i32* %23, align 4 | |
%138 = add nsw i32 %136, %137 | |
%139 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %133, i32 %138) #10 | |
store float %139, float* %24, align 4 | |
%140 = load float, float* %24, align 4 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %140, float* %20) #10 | |
%141 = bitcast float* %24 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %141) #9 | |
br label %142 | |
; <label>:142: ; preds = %131 | |
%143 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%144 = load i32, i32* %23, align 4 | |
%145 = add i32 %144, %143 | |
store i32 %145, i32* %23, align 4 | |
br label %125 | |
; <label>:146: ; preds = %129 | |
store i32 8, i32* %16, align 4 | |
br label %176 | |
; <label>:147: ; preds = %106 | |
%148 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %148) #9 | |
store i32 0, i32* %25, align 4 | |
br label %149 | |
; <label>:149: ; preds = %171, %147 | |
%150 = load i32, i32* %25, align 4 | |
%151 = icmp slt i32 %150, 16 | |
br i1 %151, label %154, label %152 | |
; <label>:152: ; preds = %149 | |
store i32 14, i32* %16, align 4 | |
%153 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %153) #9 | |
br label %174 | |
; <label>:154: ; preds = %149 | |
%155 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %155) #9 | |
%156 = load i32, i32* %19, align 4 | |
%157 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%158 = load i32, i32* %21, align 4 | |
%159 = load i32, i32* %25, align 4 | |
%160 = add nsw i32 %158, %159 | |
%161 = mul i32 %157, %160 | |
%162 = add i32 %156, %161 | |
store i32 %162, i32* %26, align 4 | |
%163 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10 | |
%164 = load i32, i32* %17, align 4 | |
%165 = load i32, i32* %6, align 4 | |
%166 = mul nsw i32 %164, %165 | |
%167 = load i32, i32* %26, align 4 | |
%168 = add nsw i32 %166, %167 | |
%169 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %163, i32 %168) #10 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %169, float* %20) #10 | |
%170 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %170) #9 | |
br label %171 | |
; <label>:171: ; preds = %154 | |
%172 = load i32, i32* %25, align 4 | |
%173 = add nsw i32 %172, 1 | |
store i32 %173, i32* %25, align 4 | |
br label %149, !llvm.loop !51 | |
; <label>:174: ; preds = %152 | |
br label %175 | |
; <label>:175: ; preds = %174 | |
store i32 0, i32* %16, align 4 | |
br label %176 | |
; <label>:176: ; preds = %175, %146 | |
%177 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %177) #9 | |
%178 = load i32, i32* %16, align 4 | |
switch i32 %178, label %183 [ | |
i32 0, label %179 | |
] | |
; <label>:179: ; preds = %176 | |
br label %180 | |
; <label>:180: ; preds = %179 | |
%181 = load i32, i32* %21, align 4 | |
%182 = add nsw i32 %181, 16 | |
store i32 %182, i32* %21, align 4 | |
br label %102 | |
; <label>:183: ; preds = %176, %105 | |
%184 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %184) #9 | |
br label %185 | |
; <label>:185: ; preds = %183 | |
call void @llvm.cuda.syncthreads() | |
%186 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %186) #9 | |
store i32 16, i32* %27, align 4 | |
br label %187 | |
; <label>:187: ; preds = %196, %185 | |
%188 = load i32, i32* %27, align 4 | |
%189 = icmp sgt i32 %188, 0 | |
br i1 %189, label %192, label %190 | |
; <label>:190: ; preds = %187 | |
store i32 17, i32* %16, align 4 | |
%191 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %191) #9 | |
br label %199 | |
; <label>:192: ; preds = %187 | |
%193 = load float, float* %20, align 4 | |
%194 = load i32, i32* %27, align 4 | |
%195 = call float @_ZL11__shfl_downfji(float %193, i32 %194, i32 32) #10 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %195, float* %20) #10 | |
br label %196 | |
; <label>:196: ; preds = %192 | |
%197 = load i32, i32* %27, align 4 | |
%198 = sdiv i32 %197, 2 | |
store i32 %198, i32* %27, align 4 | |
br label %187 | |
; <label>:199: ; preds = %190 | |
%200 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%201 = and i32 %200, 31 | |
%202 = icmp eq i32 %201, 0 | |
br i1 %202, label %203, label %209 | |
; <label>:203: ; preds = %199 | |
%204 = load i32, i32* %17, align 4 | |
%205 = sext i32 %204 to i64 | |
%206 = load float*, float** %8, align 8 | |
%207 = getelementptr inbounds float, float* %206, i64 %205 | |
%208 = load float, float* %20, align 4 | |
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %207, float %208, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10 | |
br label %209 | |
; <label>:209: ; preds = %203, %199 | |
%210 = bitcast float* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %210) #9 | |
%211 = bitcast i32* %19 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %211) #9 | |
%212 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %212) #9 | |
br label %213 | |
; <label>:213: ; preds = %209, %79 | |
call void @llvm.cuda.syncthreads() | |
%214 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %214) #9 | |
br label %215 | |
; <label>:215: ; preds = %213 | |
%216 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%217 = load i32, i32* %15, align 4 | |
%218 = add i32 %217, %216 | |
store i32 %218, i32* %15, align 4 | |
br label %73 | |
; <label>:219: ; preds = %77 | |
%220 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %220) #9 | |
%221 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %221) #9 | |
%222 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %222) #9 | |
%223 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %223) #9 | |
%224 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %224) #9 | |
ret void | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; Eigen::divup<int, int, unsigned>(x, y): ceiling integer division, computed as
; (x + y - 1) / y with an *unsigned* divide (udiv, no nsw flags — contrast the
; signed divup<int> instantiation later in this module). Used to round loop /
; grid trip counts upward.
define linkonce_odr i32 @_ZN5Eigen5divupIiijEET_T0_T1_(i32, i32) #2 comdat { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
store i32 %0, i32* %3, align 4 | |
store i32 %1, i32* %4, align 4 | |
%5 = load i32, i32* %3, align 4 | |
%6 = load i32, i32* %4, align 4 | |
; x + y - 1
%7 = add i32 %5, %6 | |
%8 = sub i32 %7, 1 | |
%9 = load i32, i32* %4, align 4 | |
; unsigned quotient: (x + y - 1) / y
%10 = udiv i32 %8, %9 | |
ret i32 %10 | |
} | |
; Function Attrs: convergent nounwind | |
; Eigen::internal::OuterReductionKernel<16, TensorEvaluator<SumReducer<float>,...>, float, int>
;   (reducer %0, input evaluator %1 (byval), i32 %2, i32 %3, float* %4 = output)
; CUDA kernel for an outer-dimension float sum reduction, unrolled by 16.
; NOTE(review): by the mangled signature, %2 is presumably num_coeffs_to_reduce
; and %3 num_preserved_coeffs — confirm against TensorReductionCuda.h.
; Structure visible in the IR below:
;  1. If gridDim.x == 1, each thread grid-stride-initializes output[i] with
;     reducer.initialize() and the block barriers via llvm.cuda.syncthreads.
;  2. Main grid-stride loop over i in [thread_id, %3 * divup(%2, 16)):
;     col = i % %3, row0 = (i / %3) * 16; reduce input coefficients
;     coeff(row * %3 + col) for row in [row0, min(row0 + 16, %2)) into a local
;     accumulator, then fold it into output[col] with atomicReduce.
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca float*, align 8 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca float, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca i32, align 4 | |
%20 = alloca i32, align 4 | |
%21 = alloca float, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
store float* %4, float** %8, align 8 | |
%22 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %22) #9 | |
; %9 <- total thread count = blockDim.x * gridDim.x (grid-stride step)
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%24 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%25 = mul i32 %23, %24 | |
store i32 %25, i32* %9, align 4 | |
%26 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %26) #9 | |
; %10 <- flat global thread id = blockIdx.x * blockDim.x + threadIdx.x
%27 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = mul i32 %27, %28 | |
%30 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%31 = add i32 %29, %30 | |
store i32 %31, i32* %10, align 4 | |
; single-block launch: initialize the output buffer in-kernel first
%32 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%33 = icmp eq i32 %32, 1 | |
br i1 %33, label %34, label %54 | |
; <label>:34: ; preds = %5 | |
%35 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %35) #9 | |
%36 = load i32, i32* %10, align 4 | |
store i32 %36, i32* %11, align 4 | |
br label %37 | |
; init loop: for (i = thread_id; i < %7; i += %9) output[i] = initialize()
; <label>:37: ; preds = %49, %34 | |
%38 = load i32, i32* %11, align 4 | |
%39 = load i32, i32* %7, align 4 | |
%40 = icmp slt i32 %38, %39 | |
br i1 %40, label %43, label %41 | |
; <label>:41: ; preds = %37 | |
%42 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %42) #9 | |
br label %53 | |
; <label>:43: ; preds = %37 | |
%44 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
%45 = load i32, i32* %11, align 4 | |
%46 = sext i32 %45 to i64 | |
%47 = load float*, float** %8, align 8 | |
%48 = getelementptr inbounds float, float* %47, i64 %46 | |
store float %44, float* %48, align 4 | |
br label %49 | |
; <label>:49: ; preds = %43 | |
%50 = load i32, i32* %9, align 4 | |
%51 = load i32, i32* %11, align 4 | |
%52 = add nsw i32 %51, %50 | |
store i32 %52, i32* %11, align 4 | |
br label %37 | |
; <label>:53: ; preds = %41 | |
; block barrier before any thread reads/accumulates into the output
call void @llvm.cuda.syncthreads() | |
br label %54 | |
; <label>:54: ; preds = %53, %5 | |
; %12 <- total work items = %7 * divup(%6, 16)
%55 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
%56 = load i32, i32* %7, align 4 | |
%57 = load i32, i32* %6, align 4 | |
%58 = call i32 @_ZN5Eigen5divupIiEET_S1_S1_(i32 %57, i32 16) #10 | |
%59 = mul nsw i32 %56, %58 | |
store i32 %59, i32* %12, align 4 | |
%60 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %60) #9 | |
%61 = load i32, i32* %10, align 4 | |
store i32 %61, i32* %13, align 4 | |
br label %62 | |
; outer grid-stride loop over work items in %13
; <label>:62: ; preds = %116, %54 | |
%63 = load i32, i32* %13, align 4 | |
%64 = load i32, i32* %12, align 4 | |
%65 = icmp slt i32 %63, %64 | |
br i1 %65, label %68, label %66 | |
; <label>:66: ; preds = %62 | |
store i32 5, i32* %14, align 4 | |
%67 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %67) #9 | |
br label %120 | |
; <label>:68: ; preds = %62 | |
; %15 <- preserved (output) index: i % %7
%69 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %69) #9 | |
%70 = load i32, i32* %13, align 4 | |
%71 = load i32, i32* %7, align 4 | |
%72 = srem i32 %70, %71 | |
store i32 %72, i32* %15, align 4 | |
; %16 <- first reduced index of this 16-wide chunk: (i / %7) * 16
%73 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %73) #9 | |
%74 = load i32, i32* %13, align 4 | |
%75 = load i32, i32* %7, align 4 | |
%76 = sdiv i32 %74, %75 | |
%77 = mul nsw i32 %76, 16 | |
store i32 %77, i32* %16, align 4 | |
; %17 <- local accumulator = reducer.initialize()
%78 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %78) #9 | |
%79 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
store float %79, float* %17, align 4 | |
; %18 <- chunk end = min(chunk start + 16, %6), clamping the last chunk
%80 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %80) #9 | |
%81 = load i32, i32* %16, align 4 | |
%82 = add nsw i32 %81, 16 | |
store i32 %82, i32* %19, align 4 | |
%83 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %19, i32* dereferenceable(4) %6) #10 | |
store i32 %83, i32* %18, align 4 | |
%84 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %84) #9 | |
%85 = load i32, i32* %16, align 4 | |
store i32 %85, i32* %20, align 4 | |
br label %86 | |
; inner loop over the up-to-16 reduced coefficients of this chunk
; <label>:86: ; preds = %103, %68 | |
%87 = load i32, i32* %20, align 4 | |
%88 = load i32, i32* %18, align 4 | |
%89 = icmp slt i32 %87, %88 | |
br i1 %89, label %92, label %90 | |
; <label>:90: ; preds = %86 | |
store i32 8, i32* %14, align 4 | |
%91 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %91) #9 | |
br label %106 | |
; <label>:92: ; preds = %86 | |
; accum = reduce(input.coeff(row * %7 + col), accum); field 10 of the
; evaluator is the nested TensorMap evaluator
%93 = bitcast float* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %93) #9 | |
%94 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10 | |
%95 = load i32, i32* %20, align 4 | |
%96 = load i32, i32* %7, align 4 | |
%97 = mul nsw i32 %95, %96 | |
%98 = load i32, i32* %15, align 4 | |
%99 = add nsw i32 %97, %98 | |
%100 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %94, i32 %99) #10 | |
store float %100, float* %21, align 4 | |
%101 = load float, float* %21, align 4 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %101, float* %17) #10 | |
%102 = bitcast float* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %102) #9 | |
br label %103 | |
; <label>:103: ; preds = %92 | |
%104 = load i32, i32* %20, align 4 | |
%105 = add nsw i32 %104, 1 | |
store i32 %105, i32* %20, align 4 | |
br label %86 | |
; <label>:106: ; preds = %90 | |
; publish the chunk's partial sum: atomicReduce(&output[col], accum, reducer)
%107 = load i32, i32* %15, align 4 | |
%108 = sext i32 %107 to i64 | |
%109 = load float*, float** %8, align 8 | |
%110 = getelementptr inbounds float, float* %109, i64 %108 | |
%111 = load float, float* %17, align 4 | |
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %110, float %111, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10 | |
%112 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %112) #9 | |
%113 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %113) #9 | |
%114 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %114) #9 | |
%115 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %115) #9 | |
br label %116 | |
; <label>:116: ; preds = %106 | |
; advance the outer loop by the grid-stride step
%117 = load i32, i32* %9, align 4 | |
%118 = load i32, i32* %13, align 4 | |
%119 = add nsw i32 %118, %117 | |
store i32 %119, i32* %13, align 4 | |
br label %62 | |
; <label>:120: ; preds = %66 | |
%121 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %121) #9 | |
%122 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %122) #9 | |
%123 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %123) #9 | |
ret void | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; Eigen::divup<int>(x, y): signed ceiling division (x + y - 1) / y.
; Unlike the <int,int,unsigned> instantiation above, the add/sub carry nsw
; flags and the divide is sdiv. Called by OuterReductionKernel with y = 16.
define linkonce_odr i32 @_ZN5Eigen5divupIiEET_S1_S1_(i32, i32) #2 comdat { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
store i32 %0, i32* %3, align 4 | |
store i32 %1, i32* %4, align 4 | |
%5 = load i32, i32* %3, align 4 | |
%6 = load i32, i32* %4, align 4 | |
; x + y - 1 (nsw: overflow would be UB in the C++ source)
%7 = add nsw i32 %5, %6 | |
%8 = sub nsw i32 %7, 1 | |
%9 = load i32, i32* %4, align 4 | |
%10 = sdiv i32 %8, %9 | |
ret i32 %10 | |
} | |
; Function Attrs: convergent nounwind | |
; Eigen::internal::EigenMetaKernel<TensorEvaluator<EvalTo<Reduction...>>, int>
;   (evaluator %0 (byval, 128 bytes), i32 %1 = size)
; Generic CUDA entry kernel: computes firstIdx = blockIdx.x * blockDim.x +
; threadIdx.x and gridSize = blockDim.x * gridDim.x, memcpy's the byval
; evaluator into two locals (%6, then %8 — the per-call byval copy for run()),
; dispatches EigenMetaKernelEval<...,Vectorizable=false>::run(%8, firstIdx,
; size, gridSize), then runs the evaluator destructors on both copies.
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.5"* byval align 8, i32) #0 comdat { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca %"struct.Eigen::TensorEvaluator.5", align 8 | |
%7 = alloca i8, align 1 | |
%8 = alloca %"struct.Eigen::TensorEvaluator.5", align 8 | |
store i32 %1, i32* %3, align 4 | |
%9 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
; %4 <- firstIdx = blockIdx.x * blockDim.x + threadIdx.x
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%12 = mul i32 %10, %11 | |
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%14 = add i32 %12, %13 | |
store i32 %14, i32* %4, align 4 | |
%15 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %15) #9 | |
; %5 <- gridSize = blockDim.x * gridDim.x (grid-stride step for run())
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%18 = mul i32 %16, %17 | |
store i32 %18, i32* %5, align 4 | |
; local copy of the byval evaluator (128 bytes)
%19 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8* | |
call void @llvm.lifetime.start(i64 128, i8* %19) #9 | |
%20 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8* | |
%21 = bitcast %"struct.Eigen::TensorEvaluator.5"* %0 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 128, i32 8, i1 false) | |
call void @llvm.lifetime.start(i64 1, i8* %7) #9 | |
store i8 0, i8* %7, align 1 | |
; second copy: the byval argument passed to ::run below
%22 = bitcast %"struct.Eigen::TensorEvaluator.5"* %8 to i8* | |
%23 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 128, i32 8, i1 false) | |
%24 = load i32, i32* %4, align 4 | |
%25 = load i32, i32* %3, align 4 | |
%26 = load i32, i32* %5, align 4 | |
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.5"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10 | |
; destroy the callee copy, then the local copy
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.5"* %8) #5 | |
call void @llvm.lifetime.end(i64 1, i8* %7) #9 | |
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.5"* %6) #5 | |
%27 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8* | |
call void @llvm.lifetime.end(i64 128, i8* %27) #9 | |
%28 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %28) #9 | |
%29 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %29) #9 | |
ret void | |
} | |
; Function Attrs: argmemonly nounwind | |
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; EigenMetaKernelEval<..., Index=int, Vectorizable=false>::run
;   (evaluator %0 (byval), i32 %1 = firstIdx, i32 %2 = size, i32 %3 = step)
; Scalar grid-stride loop: for (i = firstIdx; i < size; i += step)
;   evaluator.evalScalar(i);
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.5"* byval align 8, i32, i32, i32) #2 comdat align 2 { | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
store i32 %1, i32* %5, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%9 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
; %8 <- loop counter i, starting at firstIdx
%10 = load i32, i32* %5, align 4 | |
store i32 %10, i32* %8, align 4 | |
br label %11 | |
; <label>:11: ; preds = %19, %4 | |
%12 = load i32, i32* %8, align 4 | |
%13 = load i32, i32* %6, align 4 | |
%14 = icmp slt i32 %12, %13 | |
br i1 %14, label %17, label %15 | |
; <label>:15: ; preds = %11 | |
%16 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
br label %23 | |
; <label>:17: ; preds = %11 | |
%18 = load i32, i32* %8, align 4 | |
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.5"* %0, i32 %18) #10 | |
br label %19 | |
; <label>:19: ; preds = %17 | |
; i += step
%20 = load i32, i32* %7, align 4 | |
%21 = load i32, i32* %8, align 4 | |
%22 = add nsw i32 %21, %20 | |
store i32 %22, i32* %8, align 4 | |
br label %11 | |
; <label>:23: ; preds = %15 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Complete-object destructor (D1) for the EvalTo reduction evaluator; Itanium
; ABI thunk that simply forwards to the base-object destructor (D2).
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.5"*) unnamed_addr #4 comdat align 2 { | |
%2 = alloca %"struct.Eigen::TensorEvaluator.5"*, align 8 | |
store %"struct.Eigen::TensorEvaluator.5"* %0, %"struct.Eigen::TensorEvaluator.5"** %2, align 8 | |
%3 = load %"struct.Eigen::TensorEvaluator.5"*, %"struct.Eigen::TensorEvaluator.5"** %2, align 8 | |
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.5"* %3) #5 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; EvalTo evaluator evalScalar(i): evaluates one coefficient of the nested
; reduction expression (field 0 of TensorEvaluator.5) and stores it into the
; destination buffer (field 2, a float*): buffer[i] = reduction.coeff(i).
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.5"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator.5"*, align 8 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator.5"* %0, %"struct.Eigen::TensorEvaluator.5"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%5 = load %"struct.Eigen::TensorEvaluator.5"*, %"struct.Eigen::TensorEvaluator.5"** %3, align 8 | |
; field 0: the nested reduction evaluator
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %5, i32 0, i32 0 | |
%7 = load i32, i32* %4, align 4 | |
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator"* %6, i32 %7) #10 | |
%9 = load i32, i32* %4, align 4 | |
%10 = sext i32 %9 to i64 | |
; field 2: destination buffer pointer
%11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %5, i32 0, i32 2 | |
%12 = load float*, float** %11, align 8 | |
%13 = getelementptr inbounds float, float* %12, i64 %10 | |
store float %8, float* %13, align 4 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Reduction evaluator coeff(index): computes one output coefficient on the fly.
;   reducer = SumReducer<float>()           (stateless, 1 byte)
;   accum   = reducer.initialize()
;   first   = firstInput(index)             (map output index -> input index)
;   GenericDimReducer<0>::reduce(*this, first, reducer, &accum)
;   return reducer.finalize(accum)
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator"*, align 8 | |
%4 = alloca i32, align 4 | |
%5 = alloca %"struct.Eigen::internal::SumReducer", align 1 | |
%6 = alloca float, align 4 | |
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%7 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %3, align 8 | |
%8 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8* | |
call void @llvm.lifetime.start(i64 1, i8* %8) #9 | |
; NOTE(review): %9 (field 11 of the evaluator) is computed but unused here —
; dead code left by the front end.
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %7, i32 0, i32 11 | |
%10 = bitcast float* %6 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %10) #9 | |
%11 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %5) #10 | |
store float %11, float* %6, align 4 | |
%12 = load i32, i32* %4, align 4 | |
%13 = call i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator"* %7, i32 %12) #10 | |
call void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112) %7, i32 %13, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %5, float* %6) #10 | |
%14 = load float, float* %6, align 4 | |
%15 = call float @_ZNK5Eigen8internal10SumReducerIfE8finalizeEf(%"struct.Eigen::internal::SumReducer"* %5, float %14) #10 | |
%16 = bitcast float* %6 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
%17 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8* | |
call void @llvm.lifetime.end(i64 1, i8* %17) #9 | |
ret float %15 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; GenericDimReducer<0>::reduce(self %0, firstIndex %1, reducer %2, accum %3):
; loops j over [0, self.<field 9>[0]) — presumably m_reducedDims[0], TODO
; confirm against TensorReduction.h — and for each j recurses into the
; GenericDimReducer<-1> base case with input index
; firstIndex + j * self.<field 8>[0] (presumably the reduced-dim stride).
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 { | |
%5 = alloca %"struct.Eigen::TensorEvaluator"*, align 8 | |
%6 = alloca i32, align 4 | |
%7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
%8 = alloca float*, align 8 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %5, align 8 | |
store i32 %1, i32* %6, align 4 | |
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
store float* %3, float** %8, align 8 | |
%11 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %11) #9 | |
; %9 <- loop counter j = 0
store i32 0, i32* %9, align 4 | |
br label %12 | |
; <label>:12: ; preds = %36, %4 | |
; loop condition: j < self.<field 9>[0]
%13 = load i32, i32* %9, align 4 | |
%14 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8 | |
%15 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %14, i32 0, i32 9 | |
%16 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %15, i64 0) #10 | |
%17 = load i32, i32* %16, align 4 | |
%18 = icmp slt i32 %13, %17 | |
br i1 %18, label %21, label %19 | |
; <label>:19: ; preds = %12 | |
%20 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %20) #9 | |
br label %39 | |
; <label>:21: ; preds = %12 | |
; %10 <- input index = firstIndex + j * self.<field 8>[0]
%22 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %22) #9 | |
%23 = load i32, i32* %6, align 4 | |
%24 = load i32, i32* %9, align 4 | |
%25 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8 | |
%26 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %25, i32 0, i32 8 | |
%27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %26, i64 0) #10 | |
%28 = load i32, i32* %27, align 4 | |
%29 = mul nsw i32 %24, %28 | |
%30 = add nsw i32 %23, %29 | |
store i32 %30, i32* %10, align 4 | |
; recurse into the DimIndex = -1 base case (single coefficient reduce)
%31 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8 | |
%32 = load i32, i32* %10, align 4 | |
%33 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
%34 = load float*, float** %8, align 8 | |
call void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112) %31, i32 %32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %33, float* %34) #10 | |
%35 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %35) #9 | |
br label %36 | |
; <label>:36: ; preds = %21 | |
; ++j
%37 = load i32, i32* %9, align 4 | |
%38 = add nsw i32 %37, 1 | |
store i32 %38, i32* %9, align 4 | |
br label %12 | |
; <label>:39: ; preds = %19 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; firstInput(index): maps an output (reduced-tensor) coefficient index to the
; index of the first input coefficient contributing to it.
; NOTE(review): the per-dimension loop below has condition `icmp slt %12, 0`
; with the counter initialized to 0, so it never executes — for this 1-output-
; dim instantiation the template loop bound collapsed to zero and the compiler
; kept the dead skeleton. The function effectively returns
; index * <field 7 scalar stride> (startInput stays 0).
define linkonce_odr i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator"*, align 8 | |
%4 = alloca i32, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%8 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %3, align 8 | |
%9 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
; %5 <- startInput accumulator = 0
store i32 0, i32* %5, align 4 | |
%10 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %10) #9 | |
store i32 0, i32* %6, align 4 | |
br label %11 | |
; <label>:11: ; preds = %42, %2 | |
; dead loop: 0 < 0 is always false (see function header note)
%12 = load i32, i32* %6, align 4 | |
%13 = icmp slt i32 %12, 0 | |
br i1 %13, label %16, label %14 | |
; <label>:14: ; preds = %11 | |
%15 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %15) #9 | |
br label %45 | |
; <label>:16: ; preds = %11 | |
; (unreachable body) idx = index / divisor[i]; startInput += idx * stride;
; index -= idx * inputStride[i]
%17 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %17) #9 | |
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 4 | |
%19 = load i32, i32* %6, align 4 | |
%20 = sext i32 %19 to i64 | |
%21 = call dereferenceable(12) %"struct.Eigen::internal::TensorIntDivisor"* @_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm(%"class.Eigen::array.2"* %18, i64 %20) #10 | |
%22 = call i32 @_ZN5Eigen8internaldvIiLb0EEET_RKS2_RKNS0_16TensorIntDivisorIS2_XT0_EEE(i32* dereferenceable(4) %4, %"struct.Eigen::internal::TensorIntDivisor"* dereferenceable(12) %21) #10 | |
store i32 %22, i32* %7, align 4 | |
%23 = load i32, i32* %7, align 4 | |
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 5 | |
%25 = load i32, i32* %6, align 4 | |
%26 = sext i32 %25 to i64 | |
%27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %24, i64 %26) #10 | |
%28 = load i32, i32* %27, align 4 | |
%29 = mul nsw i32 %23, %28 | |
%30 = load i32, i32* %5, align 4 | |
%31 = add nsw i32 %30, %29 | |
store i32 %31, i32* %5, align 4 | |
%32 = load i32, i32* %7, align 4 | |
%33 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 3 | |
%34 = load i32, i32* %6, align 4 | |
%35 = sext i32 %34 to i64 | |
%36 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %33, i64 %35) #10 | |
%37 = load i32, i32* %36, align 4 | |
%38 = mul nsw i32 %32, %37 | |
%39 = load i32, i32* %4, align 4 | |
%40 = sub nsw i32 %39, %38 | |
store i32 %40, i32* %4, align 4 | |
%41 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %41) #9 | |
br label %42 | |
; <label>:42: ; preds = %16 | |
%43 = load i32, i32* %6, align 4 | |
%44 = add nsw i32 %43, 1 | |
store i32 %44, i32* %6, align 4 | |
br label %11 | |
; <label>:45: ; preds = %14 | |
; startInput += (remaining) index * <field 7 stride>; return startInput
%46 = load i32, i32* %4, align 4 | |
%47 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 7 | |
%48 = load i32, i32* %47, align 8 | |
%49 = mul nsw i32 %46, %48 | |
%50 = load i32, i32* %5, align 4 | |
%51 = add nsw i32 %50, %49 | |
store i32 %51, i32* %5, align 4 | |
%52 = load i32, i32* %5, align 4 | |
%53 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %53) #9 | |
ret i32 %52 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; SumReducer<float>::finalize(accum): identity — a sum needs no post-processing,
; so the accumulator is returned unchanged.
define linkonce_odr float @_ZNK5Eigen8internal10SumReducerIfE8finalizeEf(%"struct.Eigen::internal::SumReducer"*, float) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
%4 = alloca float, align 4 | |
store %"struct.Eigen::internal::SumReducer"* %0, %"struct.Eigen::internal::SumReducer"** %3, align 8 | |
store float %1, float* %4, align 4 | |
; `this` is loaded but unused (stateless reducer)
%5 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %3, align 8 | |
%6 = load float, float* %4, align 4 | |
ret float %6 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::array<int, 1>::operator[](size_t i) const: returns a reference
; (pointer) to element i of the single inner [1 x i32] storage array.
define linkonce_odr dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"*, i64) #4 comdat align 2 { | |
%3 = alloca %"class.Eigen::array.0"*, align 8 | |
%4 = alloca i64, align 8 | |
store %"class.Eigen::array.0"* %0, %"class.Eigen::array.0"** %3, align 8 | |
store i64 %1, i64* %4, align 8 | |
%5 = load %"class.Eigen::array.0"*, %"class.Eigen::array.0"** %3, align 8 | |
%6 = load i64, i64* %4, align 8 | |
%7 = getelementptr inbounds %"class.Eigen::array.0", %"class.Eigen::array.0"* %5, i32 0, i32 0 | |
%8 = getelementptr inbounds [1 x i32], [1 x i32]* %7, i64 0, i64 %6 | |
ret i32* %8 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; GenericDimReducer<-1>::reduce — recursion base case: fold a single input
; coefficient into the accumulator:
;   reducer.reduce(self.<field 10: nested TensorMap evaluator>.coeff(index),
;                  accum)
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 { | |
%5 = alloca %"struct.Eigen::TensorEvaluator"*, align 8 | |
%6 = alloca i32, align 4 | |
%7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
%8 = alloca float*, align 8 | |
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %5, align 8 | |
store i32 %1, i32* %6, align 4 | |
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
store float* %3, float** %8, align 8 | |
%9 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
%10 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8 | |
; field 10: the wrapped input (TensorMap) evaluator
%11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %10, i32 0, i32 10 | |
%12 = load i32, i32* %6, align 4 | |
%13 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %11, i32 %12) #10 | |
%14 = load float*, float** %8, align 8 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %9, float %13, float* %14) #10 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::internal::operator/(const int& numerator,
;                            const TensorIntDivisor<int>& divisor):
; thin forwarder to divisor.divide(numerator) — the multiply-shift fast
; integer division used by the reduction index mapping.
define internal i32 @_ZN5Eigen8internaldvIiLb0EEET_RKS2_RKNS0_16TensorIntDivisorIS2_XT0_EEE(i32* dereferenceable(4), %"struct.Eigen::internal::TensorIntDivisor"* dereferenceable(12)) #4 { | |
%3 = alloca i32*, align 8 | |
%4 = alloca %"struct.Eigen::internal::TensorIntDivisor"*, align 8 | |
store i32* %0, i32** %3, align 8 | |
store %"struct.Eigen::internal::TensorIntDivisor"* %1, %"struct.Eigen::internal::TensorIntDivisor"** %4, align 8 | |
%5 = load %"struct.Eigen::internal::TensorIntDivisor"*, %"struct.Eigen::internal::TensorIntDivisor"** %4, align 8 | |
%6 = load i32*, i32** %3, align 8 | |
%7 = load i32, i32* %6, align 4 | |
%8 = call i32 @_ZNK5Eigen8internal16TensorIntDivisorIiLb0EE6divideEi(%"struct.Eigen::internal::TensorIntDivisor"* %5, i32 %7) #10 | |
ret i32 %8 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Eigen::array<TensorIntDivisor<int>, 1>::operator[](size_t i) const: returns
; a reference (pointer) to element i of the inner 1-element storage array.
define linkonce_odr dereferenceable(12) %"struct.Eigen::internal::TensorIntDivisor"* @_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm(%"class.Eigen::array.2"*, i64) #4 comdat align 2 { | |
%3 = alloca %"class.Eigen::array.2"*, align 8 | |
%4 = alloca i64, align 8 | |
store %"class.Eigen::array.2"* %0, %"class.Eigen::array.2"** %3, align 8 | |
store i64 %1, i64* %4, align 8 | |
%5 = load %"class.Eigen::array.2"*, %"class.Eigen::array.2"** %3, align 8 | |
%6 = load i64, i64* %4, align 8 | |
%7 = getelementptr inbounds %"class.Eigen::array.2", %"class.Eigen::array.2"* %5, i32 0, i32 0 | |
%8 = getelementptr inbounds [1 x %"struct.Eigen::internal::TensorIntDivisor"], [1 x %"struct.Eigen::internal::TensorIntDivisor"]* %7, i64 0, i64 %6 | |
ret %"struct.Eigen::internal::TensorIntDivisor"* %8 | |
} | |
; Eigen::internal::TensorIntDivisor<int, false>::divide(int) const
; Division by a runtime-invariant divisor using precomputed magic numbers
; (multiply-high + two shifts), avoiding a hardware divide:
;   t      = umulh(field0, n)
;   result = (t + ((n - t) >> field1)) >> field2
; The three i32 fields of %struct...TensorIntDivisor at indices 0/1/2 are
; presumably multiplier/shift1/shift2 per Eigen's TensorIntDiv.h — TODO confirm
; against the header; the IR only shows them used as shown above.
; Function Attrs: convergent inlinehint nounwind | |
define linkonce_odr i32 @_ZNK5Eigen8internal16TensorIntDivisorIiLb0EE6divideEi(%"struct.Eigen::internal::TensorIntDivisor"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::internal::TensorIntDivisor"*, align 8 | |
%4 = alloca i32, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
store %"struct.Eigen::internal::TensorIntDivisor"* %0, %"struct.Eigen::internal::TensorIntDivisor"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%7 = load %"struct.Eigen::internal::TensorIntDivisor"*, %"struct.Eigen::internal::TensorIntDivisor"** %3, align 8 | |
%8 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %8) #9 | |
; t = umulh(this->field0, n)   -- high 32 bits of the 32x32 product
%9 = getelementptr inbounds %"struct.Eigen::internal::TensorIntDivisor", %"struct.Eigen::internal::TensorIntDivisor"* %7, i32 0, i32 0 | |
%10 = load i32, i32* %9, align 4 | |
%11 = load i32, i32* %4, align 4 | |
%12 = call i32 @_ZN5Eigen8internal12_GLOBAL__N_15muluhIiEEjjT_(i32 %10, i32 %11) #10 | |
store i32 %12, i32* %5, align 4 | |
%13 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %13) #9 | |
; q = (n - t) >> this->field1   (logical shift)
%14 = load i32, i32* %4, align 4 | |
%15 = load i32, i32* %5, align 4 | |
%16 = sub i32 %14, %15 | |
%17 = getelementptr inbounds %"struct.Eigen::internal::TensorIntDivisor", %"struct.Eigen::internal::TensorIntDivisor"* %7, i32 0, i32 1 | |
%18 = load i32, i32* %17, align 4 | |
%19 = lshr i32 %16, %18 | |
store i32 %19, i32* %6, align 4 | |
; result = (t + q) >> this->field2
%20 = load i32, i32* %5, align 4 | |
%21 = load i32, i32* %6, align 4 | |
%22 = add i32 %20, %21 | |
%23 = getelementptr inbounds %"struct.Eigen::internal::TensorIntDivisor", %"struct.Eigen::internal::TensorIntDivisor"* %7, i32 0, i32 2 | |
%24 = load i32, i32* %23, align 4 | |
%25 = lshr i32 %22, %24 | |
%26 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %26) #9 | |
%27 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %27) #9 | |
ret i32 %25 | |
} | |
; Eigen::internal::(anonymous namespace)::muluh<int>(unsigned, unsigned)
; Thin forwarder to the CUDA __umulhi builtin wrapper below; exists so the
; Eigen templates have a uniform "multiply, take high word" entry point.
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define internal i32 @_ZN5Eigen8internal12_GLOBAL__N_15muluhIiEEjjT_(i32, i32) #2 { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
store i32 %0, i32* %3, align 4 | |
store i32 %1, i32* %4, align 4 | |
%5 = load i32, i32* %3, align 4 | |
%6 = load i32, i32* %4, align 4 | |
%7 = call i32 @_ZL8__umulhijj(i32 %5, i32 %6) #10 | |
ret i32 %7 | |
} | |
; __umulhi(unsigned, unsigned): CUDA device builtin wrapper. Forwards to
; libdevice @__nv_umulhi, which returns the most-significant 32 bits of the
; 64-bit product of the two unsigned 32-bit operands.
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define internal i32 @_ZL8__umulhijj(i32, i32) #2 { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
store i32 %0, i32* %3, align 4 | |
store i32 %1, i32* %4, align 4 | |
%5 = load i32, i32* %3, align 4 | |
%6 = load i32, i32* %4, align 4 | |
%7 = call i32 @__nv_umulhi(i32 %5, i32 %6) #10 | |
ret i32 %7 | |
} | |
; Base-object destructor (D2) for TensorEvaluator<const TensorEvalToOp<...>,
; GpuDevice>. Trivial at -O0: loads `this` back from its spill slot and
; returns — no member teardown is emitted.
; Function Attrs: convergent inlinehint nounwind | |
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.5"*) unnamed_addr #4 comdat align 2 { | |
%2 = alloca %"struct.Eigen::TensorEvaluator.5"*, align 8 | |
store %"struct.Eigen::TensorEvaluator.5"* %0, %"struct.Eigen::TensorEvaluator.5"** %2, align 8 | |
%3 = load %"struct.Eigen::TensorEvaluator.5"*, %"struct.Eigen::TensorEvaluator.5"** %2, align 8 | |
ret void | |
} | |
; __global__ InitVector<PtrWrapper<float,int>>(float init_val, int num_preserved_values, PtrWrapper out)
; Fill kernel: writes %0 (the init value) into out[i] for every i in
; [0, %1) using a grid-stride loop:
;   i = blockIdx.x * blockDim.x + threadIdx.x;  i += gridDim.x * blockDim.x
; so correctness is independent of the launch configuration. The PtrWrapper
; is passed byval; element addresses come from PtrWrapper::coeffRef.
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_(float, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
%4 = alloca float, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
store float %0, float* %4, align 4 | |
store i32 %1, i32* %5, align 4 | |
%8 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %8) #9 | |
; tid = blockIdx.x * blockDim.x + threadIdx.x
%9 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%10 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%11 = mul i32 %9, %10 | |
%12 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%13 = add i32 %11, %12 | |
store i32 %13, i32* %6, align 4 | |
%14 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %14) #9 | |
%15 = load i32, i32* %6, align 4 | |
store i32 %15, i32* %7, align 4 | |
br label %16 | |
; loop header: continue while i < num_preserved_values (signed compare)
; <label>:16: ; preds = %26, %3 | |
%17 = load i32, i32* %7, align 4 | |
%18 = load i32, i32* %5, align 4 | |
%19 = icmp slt i32 %17, %18 | |
br i1 %19, label %22, label %20 | |
; <label>:20: ; preds = %16 | |
%21 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %21) #9 | |
br label %32 | |
; loop body: out[i] = init_val
; <label>:22: ; preds = %16 | |
%23 = load float, float* %4, align 4 | |
%24 = load i32, i32* %7, align 4 | |
%25 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %2, i32 %24) #10 | |
store float %23, float* %25, align 4 | |
br label %26 | |
; loop latch: i += gridDim.x * blockDim.x (grid stride)
; <label>:26: ; preds = %22 | |
%27 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = mul i32 %27, %28 | |
%30 = load i32, i32* %7, align 4 | |
%31 = add i32 %30, %29 | |
store i32 %31, i32* %7, align 4 | |
br label %16 | |
; <label>:32: ; preds = %20 | |
%33 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %33) #9 | |
ret void | |
} | |
; Eigen::internal::PtrWrapper<float,int>::coeffRef(int)
; Returns &base[index]: loads the wrapped float* from field 0, sign-extends
; the 32-bit index to i64 and indexes the array. No bounds checking.
; Function Attrs: convergent nounwind | |
define linkonce_odr dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"*, i32) #0 comdat align 2 { | |
%3 = alloca %"struct.Eigen::internal::PtrWrapper"*, align 8 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::internal::PtrWrapper"* %0, %"struct.Eigen::internal::PtrWrapper"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%5 = load %"struct.Eigen::internal::PtrWrapper"*, %"struct.Eigen::internal::PtrWrapper"** %3, align 8 | |
%6 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %5, i32 0, i32 0 | |
%7 = load float*, float** %6, align 8 | |
%8 = load i32, i32* %4, align 4 | |
%9 = sext i32 %8 to i64 | |
%10 = getelementptr inbounds float, float* %7, i64 %9 | |
ret float* %10 | |
} | |
; __global__ RowReduceKernel<GRID_DIM=32, BLOCK_DIM=256, NUM_PER_THREAD=128,
;                            TensorEvaluator<TensorMap<Tensor<float,2>>>,
;                            PtrWrapper<float,int>, CudaSumReducer>
;     (reducer %0 byval, input %1 byval, num_rows %2, num_cols %3, output %4 byval)
; Reduces each row of a row-major (num_rows x num_cols) float tensor to a
; single sum in output[row]. Structure visible in the IR:
;   - asserts blockDim == (256,1,1) and gridDim == (32,1,1) via __assert_fail;
;   - column-block count = ceil(num_cols / 32768), where 32768 = 256*128
;     (threads-per-block * elements-per-thread);
;   - outer loop over (row, col_block) work items, strided by gridDim.x = 32;
;   - per thread, accumulates 128 strided elements in 16-iteration batches,
;     with a slower bounds-checked path when a batch may cross num_cols;
;   - warp-level tree reduction via __shfl_down (offsets 16,8,...,1, width 32);
;   - lane 0 of each warp merges its partial into output[row] via
;     FnSumReducer::atomic_reduce (atomicAdd).
; NOTE(review): %14 (the unaligned i32 alloca) is the -O0 "cleanup
; destination" slot used by the switch-based loop-exit dispatching below.
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca i32, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca float, align 4 | |
%20 = alloca i32, align 4 | |
%21 = alloca i32, align 4 | |
%22 = alloca i32, align 4 | |
%23 = alloca i32, align 4 | |
%24 = alloca i32, align 4 | |
%25 = alloca i32, align 4 | |
%26 = alloca i32, align 4 | |
%27 = alloca i32, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
; --- launch-configuration asserts: blockDim must be (256,1,1) ---
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = icmp eq i32 %28, 256 | |
br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %5 | |
br label %32 | |
; <label>:31: ; preds = %5 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %32 | |
; <label>:32: ; preds = %31, %30 | |
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
%34 = icmp eq i32 %33, 1 | |
br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
br label %37 | |
; <label>:36: ; preds = %32 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %37 | |
; <label>:37: ; preds = %36, %35 | |
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
%39 = icmp eq i32 %38, 1 | |
br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
br label %42 | |
; <label>:41: ; preds = %37 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %42 | |
; --- gridDim must be (32,1,1) ---
; <label>:42: ; preds = %41, %40 | |
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%44 = icmp eq i32 %43, 32 | |
br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
br label %47 | |
; <label>:46: ; preds = %42 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %47 | |
; <label>:47: ; preds = %46, %45 | |
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
%49 = icmp eq i32 %48, 1 | |
br i1 %49, label %50, label %51 | |
; <label>:50: ; preds = %47 | |
br label %52 | |
; <label>:51: ; preds = %47 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %52 | |
; <label>:52: ; preds = %51, %50 | |
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
%54 = icmp eq i32 %53, 1 | |
br i1 %54, label %55, label %56 | |
; <label>:55: ; preds = %52 | |
br label %57 | |
; <label>:56: ; preds = %52 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %57 | |
; --- setup: work partitioning ---
; %8  = 16 (inner-unroll batch size)
; %9  = input_col_blocks = (num_cols + 32768 - 1) / 32768
; %10 = num_input_blocks = input_col_blocks * num_rows
; %11 = blockIdx.x, %12 = threadIdx.x, %13 = outer loop index i
; <label>:57: ; preds = %56, %55 | |
%58 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %58) #9 | |
store i32 16, i32* %8, align 4 | |
%59 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
%60 = load i32, i32* %7, align 4 | |
%61 = add nsw i32 %60, 32768 | |
%62 = sub nsw i32 %61, 1 | |
%63 = sdiv i32 %62, 32768 | |
store i32 %63, i32* %9, align 4 | |
%64 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %64) #9 | |
%65 = load i32, i32* %9, align 4 | |
%66 = load i32, i32* %6, align 4 | |
%67 = mul nsw i32 %65, %66 | |
store i32 %67, i32* %10, align 4 | |
%68 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %68) #9 | |
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %69, i32* %11, align 4 | |
%70 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %70) #9 | |
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %71, i32* %12, align 4 | |
%72 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %72) #9 | |
%73 = load i32, i32* %11, align 4 | |
store i32 %73, i32* %13, align 4 | |
br label %74 | |
; --- outer loop: i = blockIdx.x; i < num_input_blocks; i += 32 ---
; <label>:74: ; preds = %237, %57 | |
%75 = load i32, i32* %13, align 4 | |
%76 = load i32, i32* %10, align 4 | |
%77 = icmp slt i32 %75, %76 | |
br i1 %77, label %80, label %78 | |
; <label>:78: ; preds = %74 | |
store i32 2, i32* %14, align 4 | |
%79 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %79) #9 | |
br label %240 | |
; outer body: decompose i into (row, col_block):
; %15 = col_block = i % input_col_blocks
; %16 = row       = i / input_col_blocks
; %17 = col_begin = col_block * 256 * 128 + threadIdx.x
; %18 = row (copy), %19 = running partial, seeded with bottom_value() = 0.0f
; <label>:80: ; preds = %74 | |
%81 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %81) #9 | |
%82 = load i32, i32* %13, align 4 | |
%83 = load i32, i32* %9, align 4 | |
%84 = srem i32 %82, %83 | |
store i32 %84, i32* %15, align 4 | |
%85 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
%86 = load i32, i32* %13, align 4 | |
%87 = load i32, i32* %9, align 4 | |
%88 = sdiv i32 %86, %87 | |
store i32 %88, i32* %16, align 4 | |
%89 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %89) #9 | |
%90 = load i32, i32* %15, align 4 | |
%91 = mul nsw i32 %90, 256 | |
%92 = mul nsw i32 %91, 128 | |
%93 = load i32, i32* %12, align 4 | |
%94 = add nsw i32 %92, %93 | |
store i32 %94, i32* %17, align 4 | |
%95 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
%96 = load i32, i32* %16, align 4 | |
store i32 %96, i32* %18, align 4 | |
%97 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %97) #9 | |
%98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10 | |
store float %99, float* %19, align 4 | |
; skip accumulation entirely for out-of-range rows (row >= num_rows)
%100 = load i32, i32* %18, align 4 | |
%101 = load i32, i32* %6, align 4 | |
%102 = icmp slt i32 %100, %101 | |
br i1 %102, label %103, label %198 | |
; --- accumulation: j = 0; j < 128; j += 16 (batches of 16 elements) ---
; <label>:103: ; preds = %80 | |
%104 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %104) #9 | |
store i32 0, i32* %20, align 4 | |
br label %105 | |
; <label>:105: ; preds = %192, %103 | |
%106 = load i32, i32* %20, align 4 | |
%107 = icmp slt i32 %106, 128 | |
br i1 %107, label %109, label %108 | |
; <label>:108: ; preds = %105 | |
store i32 5, i32* %14, align 4 | |
br label %195 | |
; batch dispatch: %21 = last column this batch would touch,
; col_begin + 256 * (j + 16 - 1). If it is >= num_cols, take the
; bounds-checked path (%120); otherwise the unchecked fast path (%158).
; <label>:109: ; preds = %105 | |
%110 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %110) #9 | |
%111 = load i32, i32* %17, align 4 | |
%112 = load i32, i32* %20, align 4 | |
%113 = add nsw i32 %112, 16 | |
%114 = sub nsw i32 %113, 1 | |
%115 = mul nsw i32 256, %114 | |
%116 = add nsw i32 %111, %115 | |
store i32 %116, i32* %21, align 4 | |
%117 = load i32, i32* %21, align 4 | |
%118 = load i32, i32* %7, align 4 | |
%119 = icmp sge i32 %117, %118 | |
br i1 %119, label %120, label %158 | |
; --- boundary path: k = 0; k < 15; ++k with per-element col < num_cols check.
; NOTE(review): upper bound is 15 here (vs 16 on the fast path) — this
; mirrors the compiled Eigen source; confirm against TensorReductionCuda.h
; before assuming it is an off-by-one.
; <label>:120: ; preds = %109 | |
%121 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %121) #9 | |
store i32 0, i32* %22, align 4 | |
br label %122 | |
; <label>:122: ; preds = %152, %120 | |
%123 = load i32, i32* %22, align 4 | |
%124 = icmp slt i32 %123, 15 | |
br i1 %124, label %126, label %125 | |
; <label>:125: ; preds = %122 | |
store i32 8, i32* %14, align 4 | |
br label %155 | |
; <label>:126: ; preds = %122 | |
%127 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %127) #9 | |
%128 = load i32, i32* %17, align 4 | |
%129 = load i32, i32* %20, align 4 | |
%130 = load i32, i32* %22, align 4 | |
%131 = add nsw i32 %129, %130 | |
%132 = mul nsw i32 256, %131 | |
%133 = add nsw i32 %128, %132 | |
store i32 %133, i32* %23, align 4 | |
%134 = load i32, i32* %23, align 4 | |
%135 = load i32, i32* %7, align 4 | |
%136 = icmp sge i32 %134, %135 | |
br i1 %136, label %137, label %138 | |
; <label>:137: ; preds = %126 | |
store i32 8, i32* %14, align 4 | |
br label %148 | |
; in-bounds element: partial = reduce(partial, input[row * num_cols + col])
; <label>:138: ; preds = %126 | |
%139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%140 = load float, float* %19, align 4 | |
%141 = load i32, i32* %18, align 4 | |
%142 = load i32, i32* %7, align 4 | |
%143 = mul nsw i32 %141, %142 | |
%144 = load i32, i32* %23, align 4 | |
%145 = add nsw i32 %143, %144 | |
%146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %145) #10 | |
%147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10 | |
store float %147, float* %19, align 4 | |
store i32 0, i32* %14, align 4 | |
br label %148 | |
; <label>:148: ; preds = %138, %137 | |
%149 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %149) #9 | |
%150 = load i32, i32* %14, align 4 | |
switch i32 %150, label %155 [ | |
i32 0, label %151 | |
] | |
; <label>:151: ; preds = %148 | |
br label %152 | |
; <label>:152: ; preds = %151 | |
%153 = load i32, i32* %22, align 4 | |
%154 = add nsw i32 %153, 1 | |
store i32 %154, i32* %22, align 4 | |
br label %122, !llvm.loop !52 | |
; <label>:155: ; preds = %148, %125 | |
%156 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %156) #9 | |
br label %157 | |
; <label>:157: ; preds = %155 | |
store i32 5, i32* %14, align 4 | |
br label %188 | |
; --- fast path: whole batch in bounds, k = 0; k < 16; ++k, no per-element check
; <label>:158: ; preds = %109 | |
%159 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %159) #9 | |
store i32 0, i32* %24, align 4 | |
br label %160 | |
; <label>:160: ; preds = %183, %158 | |
%161 = load i32, i32* %24, align 4 | |
%162 = icmp slt i32 %161, 16 | |
br i1 %162, label %165, label %163 | |
; <label>:163: ; preds = %160 | |
store i32 11, i32* %14, align 4 | |
%164 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %164) #9 | |
br label %186 | |
; col = col_begin + 256 * (j + k);  partial = reduce(partial, input[row * num_cols + col])
; <label>:165: ; preds = %160 | |
%166 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %166) #9 | |
%167 = load i32, i32* %17, align 4 | |
%168 = load i32, i32* %20, align 4 | |
%169 = load i32, i32* %24, align 4 | |
%170 = add nsw i32 %168, %169 | |
%171 = mul nsw i32 256, %170 | |
%172 = add nsw i32 %167, %171 | |
store i32 %172, i32* %25, align 4 | |
%173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%174 = load float, float* %19, align 4 | |
%175 = load i32, i32* %18, align 4 | |
%176 = load i32, i32* %7, align 4 | |
%177 = mul nsw i32 %175, %176 | |
%178 = load i32, i32* %25, align 4 | |
%179 = add nsw i32 %177, %178 | |
%180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %179) #10 | |
%181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10 | |
store float %181, float* %19, align 4 | |
%182 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %182) #9 | |
br label %183 | |
; <label>:183: ; preds = %165 | |
%184 = load i32, i32* %24, align 4 | |
%185 = add nsw i32 %184, 1 | |
store i32 %185, i32* %24, align 4 | |
br label %160, !llvm.loop !53 | |
; <label>:186: ; preds = %163 | |
br label %187 | |
; <label>:187: ; preds = %186 | |
store i32 0, i32* %14, align 4 | |
br label %188 | |
; <label>:188: ; preds = %187, %157 | |
%189 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %189) #9 | |
%190 = load i32, i32* %14, align 4 | |
switch i32 %190, label %195 [ | |
i32 0, label %191 | |
] | |
; <label>:191: ; preds = %188 | |
br label %192 | |
; <label>:192: ; preds = %191 | |
%193 = load i32, i32* %20, align 4 | |
%194 = add nsw i32 %193, 16 | |
store i32 %194, i32* %20, align 4 | |
br label %105, !llvm.loop !54 | |
; <label>:195: ; preds = %188, %108 | |
%196 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %196) #9 | |
br label %197 | |
; <label>:197: ; preds = %195 | |
br label %198 | |
; --- warp tree reduction: for (offset = 16; offset > 0; offset /= 2)
;       partial = reduce(__shfl_down(partial, offset, 32), partial)
; Note: pre-Volta mask-less __shfl_down — relies on implicit warp synchrony.
; <label>:198: ; preds = %197, %80 | |
%199 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %199) #9 | |
store i32 16, i32* %26, align 4 | |
br label %200 | |
; <label>:200: ; preds = %212, %198 | |
%201 = load i32, i32* %26, align 4 | |
%202 = icmp sgt i32 %201, 0 | |
br i1 %202, label %205, label %203 | |
; <label>:203: ; preds = %200 | |
store i32 14, i32* %14, align 4 | |
%204 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %204) #9 | |
br label %215 | |
; <label>:205: ; preds = %200 | |
%206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%207 = load float, float* %19, align 4 | |
%208 = load i32, i32* %26, align 4 | |
%209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10 | |
%210 = load float, float* %19, align 4 | |
%211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10 | |
store float %211, float* %19, align 4 | |
br label %212 | |
; <label>:212: ; preds = %205 | |
%213 = load i32, i32* %26, align 4 | |
%214 = sdiv i32 %213, 2 | |
store i32 %214, i32* %26, align 4 | |
br label %200, !llvm.loop !56 | |
; --- publish: lane 0 (%27 = threadIdx.x & 31) of each warp, for valid rows,
;     atomically merges its warp's partial into output[row].
; <label>:215: ; preds = %203 | |
%216 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %216) #9 | |
%217 = load i32, i32* %12, align 4 | |
%218 = and i32 %217, 31 | |
store i32 %218, i32* %27, align 4 | |
%219 = load i32, i32* %27, align 4 | |
%220 = icmp eq i32 %219, 0 | |
br i1 %220, label %221, label %230 | |
; <label>:221: ; preds = %215 | |
%222 = load i32, i32* %18, align 4 | |
%223 = load i32, i32* %6, align 4 | |
%224 = icmp slt i32 %222, %223 | |
br i1 %224, label %225, label %230 | |
; <label>:225: ; preds = %221 | |
%226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%227 = load i32, i32* %18, align 4 | |
%228 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %227) #10 | |
%229 = load float, float* %19, align 4 | |
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10 | |
br label %230 | |
; per-iteration cleanup, then advance the outer loop
; <label>:230: ; preds = %225, %221, %215 | |
%231 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %231) #9 | |
%232 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %232) #9 | |
%233 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %233) #9 | |
%234 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %234) #9 | |
%235 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %235) #9 | |
%236 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %236) #9 | |
br label %237 | |
; <label>:237: ; preds = %230 | |
%238 = load i32, i32* %13, align 4 | |
%239 = add nsw i32 %238, 32 | |
store i32 %239, i32* %13, align 4 | |
br label %74 | |
; function exit: end the lifetimes of the outer-scope locals
; <label>:240: ; preds = %78 | |
%241 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %241) #9 | |
%242 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %242) #9 | |
%243 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %243) #9 | |
%244 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %244) #9 | |
%245 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %245) #9 | |
ret void | |
} | |
; __assert_fail(expr, file, line, func): CUDA device-side assert shim.
; Forwards to the CUDA runtime's @__assertfail (last arg 1 = sizeof(char))
; and never returns; the trailing `ret` after `unreachable` is dead code the
; front end emits for the void-returning wrapper.
; Function Attrs: convergent inlinehint nounwind | |
define internal void @_ZL13__assert_failPKcS0_jS0_(i8*, i8*, i32, i8*) #4 { | |
%5 = alloca i8*, align 8 | |
%6 = alloca i8*, align 8 | |
%7 = alloca i32, align 4 | |
%8 = alloca i8*, align 8 | |
store i8* %0, i8** %5, align 8 | |
store i8* %1, i8** %6, align 8 | |
store i32 %2, i32* %7, align 4 | |
store i8* %3, i8** %8, align 8 | |
%9 = load i8*, i8** %5, align 8 | |
%10 = load i8*, i8** %6, align 8 | |
%11 = load i32, i32* %7, align 4 | |
%12 = load i8*, i8** %8, align 8 | |
call void @__assertfail(i8* %9, i8* %10, i32 %11, i8* %12, i64 1) #11 | |
unreachable | |
; No predecessors! | |
ret void | |
} | |
; blockDim.y accessor — wraps the NVPTX intrinsic llvm.ptx.read.ntid.y.
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define linkonce_odr i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.ntid.y() | |
ret i32 %1 | |
} | |
; blockDim.z accessor — wraps the NVPTX intrinsic llvm.ptx.read.ntid.z.
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define linkonce_odr i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.ntid.z() | |
ret i32 %1 | |
} | |
; gridDim.y accessor — wraps the NVPTX intrinsic llvm.ptx.read.nctaid.y.
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define linkonce_odr i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.nctaid.y() | |
ret i32 %1 | |
} | |
; gridDim.z accessor — wraps the NVPTX intrinsic llvm.ptx.read.nctaid.z.
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define linkonce_odr i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #2 comdat align 2 { | |
%1 = call i32 @llvm.ptx.read.nctaid.z() | |
ret i32 %1 | |
} | |
; FnSumReducer<Identity>::bottom_value(): the identity element for the sum
; reduction. Ignores `this` and returns constant 0.0f.
; Function Attrs: convergent nounwind | |
define internal float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*) #0 align 2 { | |
%2 = alloca %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, align 8 | |
store %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %2, align 8 | |
%3 = load %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %2, align 8 | |
ret float %3 is unused; constant identity below | |
} | |
; FnSumReducer<Identity>::operator()(float a, float b) const
; Returns a + fn_(b), where fn_ is the Identity member at field 0 (its
; operator() in this module returns its argument unchanged), so this
; computes the plain sum a + b via an fadd.
; Function Attrs: convergent nounwind | |
define internal float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, float, float) #0 align 2 { | |
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, align 8 | |
%5 = alloca float, align 4 | |
%6 = alloca float, align 4 | |
store %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8 | |
store float %1, float* %5, align 4 | |
store float %2, float* %6, align 4 | |
%7 = load %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8 | |
%8 = load float, float* %5, align 4 | |
%9 = getelementptr inbounds %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer", %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %7, i32 0, i32 0 | |
%10 = load float, float* %6, align 4 | |
%11 = call float @_ZNK5Eigen8internal12_GLOBAL__N_18IdentityclEf(%"struct.Eigen::internal::(anonymous namespace)::Identity"* %9, float %10) #10 | |
%12 = fadd float %8, %11 | |
ret float %12 | |
} | |
; FnSumReducer<Identity>::atomic_reduce(float* dst, float value) const
; Merges a per-warp partial into global memory: calls atomicAdd(dst, value)
; and discards the returned previous value. `this` is loaded but unused.
; Function Attrs: convergent nounwind | |
define internal void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, float*, float) #0 align 2 { | |
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, align 8 | |
%5 = alloca float*, align 8 | |
%6 = alloca float, align 4 | |
store %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8 | |
store float* %1, float** %5, align 8 | |
store float %2, float* %6, align 4 | |
%7 = load %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8 | |
%8 = load float*, float** %5, align 8 | |
%9 = load float, float* %6, align 4 | |
%10 = call float @_ZL9atomicAddPff(float* %8, float %9) #10 | |
ret void | |
} | |
; External declarations resolved at link time: the CUDA runtime's device-side
; assert handler, and the NVPTX special-register read intrinsics backing the
; blockDim/gridDim accessor wrappers above.
; Function Attrs: convergent noreturn | |
declare void @__assertfail(i8*, i8*, i32, i8*, i64) #7 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ntid.y() #3 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ntid.z() #3 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.nctaid.y() #3 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.nctaid.z() #3 | |
; Eigen::internal::(anonymous namespace)::Identity::operator()(float) const
; The identity functor: returns its float argument unchanged. `this` is
; spilled/reloaded at -O0 but never used.
; Function Attrs: convergent nounwind | |
define internal float @_ZNK5Eigen8internal12_GLOBAL__N_18IdentityclEf(%"struct.Eigen::internal::(anonymous namespace)::Identity"*, float) #0 align 2 { | |
%3 = alloca %"struct.Eigen::internal::(anonymous namespace)::Identity"*, align 8 | |
%4 = alloca float, align 4 | |
store %"struct.Eigen::internal::(anonymous namespace)::Identity"* %0, %"struct.Eigen::internal::(anonymous namespace)::Identity"** %3, align 8 | |
store float %1, float* %4, align 4 | |
%5 = load %"struct.Eigen::internal::(anonymous namespace)::Identity"*, %"struct.Eigen::internal::(anonymous namespace)::Identity"** %3, align 8 | |
%6 = load float, float* %4, align 4 | |
ret float %6 | |
} | |
; Function Attrs: convergent nounwind | |
; Eigen::internal::(anon)::RowReduceKernel<NumBlocks=32, BlockSize=256, NumPerThread=128>
;   (CudaMaxReducer reducer /*%0, byval*/, TensorEvaluator input /*%1, byval*/,
;    i32 %2, i32 %3, PtrWrapper<float,int> output /*%4, byval*/)
; CUDA __global__ kernel: each group of threads max-reduces one "row" of a 2D
; tensor into output[row] via an atomic float-max. %2 and %3 are the two extent
; arguments (stored to %6 and %7); from the indexing below, %7 is the length of
; the reduced dimension and %6 the number of results — NOTE(review): inferred
; from the address arithmetic, confirm against the Eigen source.
; The hard-coded 256/32/128/16/32768 constants match the template arguments
; (32768 = 256 * 128 = elements covered per block pass, 16 = unroll chunk).
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
; stack slots: %6/%7 = the two i32 extents; %8..%13 = loop bookkeeping;
; %14 = cleanup-dispatch state (lifetime-marker control flow emitted by clang);
; %15..%27 = per-iteration temporaries; %19 = the running float maximum.
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca i32, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca float, align 4 | |
%20 = alloca i32, align 4 | |
%21 = alloca i32, align 4 | |
%22 = alloca i32, align 4 | |
%23 = alloca i32, align 4 | |
%24 = alloca i32, align 4 | |
%25 = alloca i32, align 4 | |
%26 = alloca i32, align 4 | |
%27 = alloca i32, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
; --- launch-shape assertions: the kernel requires blockDim == (256,1,1) and
; --- gridDim == (32,1,1), i.e. exactly the <32,256,...> template parameters.
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = icmp eq i32 %28, 256 | |
br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %5 | |
br label %32 | |
; <label>:31: ; preds = %5 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %32 | |
; <label>:32: ; preds = %31, %30 | |
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
%34 = icmp eq i32 %33, 1 | |
br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
br label %37 | |
; <label>:36: ; preds = %32 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %37 | |
; <label>:37: ; preds = %36, %35 | |
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
%39 = icmp eq i32 %38, 1 | |
br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
br label %42 | |
; <label>:41: ; preds = %37 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %42 | |
; <label>:42: ; preds = %41, %40 | |
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%44 = icmp eq i32 %43, 32 | |
br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
br label %47 | |
; <label>:46: ; preds = %42 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %47 | |
; <label>:47: ; preds = %46, %45 | |
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
%49 = icmp eq i32 %48, 1 | |
br i1 %49, label %50, label %51 | |
; <label>:50: ; preds = %47 | |
br label %52 | |
; <label>:51: ; preds = %47 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %52 | |
; <label>:52: ; preds = %51, %50 | |
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
%54 = icmp eq i32 %53, 1 | |
br i1 %54, label %55, label %56 | |
; <label>:55: ; preds = %52 | |
br label %57 | |
; <label>:56: ; preds = %52 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIFlLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %57 | |
; <label>:57: ; preds = %56, %55 | |
; --- setup: %8 = 16 (unroll width, unused below except as documentation of
; --- the chunking); %9 = ceil(%7 / 32768) = blocks needed per row;
; --- %10 = %9 * %6 = total virtual blocks; %11 = blockIdx.x; %12 = threadIdx.x.
%58 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %58) #9 | |
store i32 16, i32* %8, align 4 | |
%59 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
%60 = load i32, i32* %7, align 4 | |
%61 = add nsw i32 %60, 32768 | |
%62 = sub nsw i32 %61, 1 | |
%63 = sdiv i32 %62, 32768 | |
store i32 %63, i32* %9, align 4 | |
%64 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %64) #9 | |
%65 = load i32, i32* %9, align 4 | |
%66 = load i32, i32* %6, align 4 | |
%67 = mul nsw i32 %65, %66 | |
store i32 %67, i32* %10, align 4 | |
%68 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %68) #9 | |
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %69, i32* %11, align 4 | |
%70 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %70) #9 | |
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %71, i32* %12, align 4 | |
%72 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %72) #9 | |
%73 = load i32, i32* %11, align 4 | |
store i32 %73, i32* %13, align 4 | |
br label %74 | |
; --- outer grid-stride loop over virtual blocks:
; --- for (%13 = blockIdx.x; %13 < %10; %13 += 32 /* = gridDim.x */)
; <label>:74: ; preds = %232, %57 | |
%75 = load i32, i32* %13, align 4 | |
%76 = load i32, i32* %10, align 4 | |
%77 = icmp slt i32 %75, %76 | |
br i1 %77, label %80, label %78 | |
; <label>:78: ; preds = %74 | |
store i32 2, i32* %14, align 4 | |
%79 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %79) #9 | |
br label %235 | |
; <label>:80: ; preds = %74 | |
; per virtual block: %15 = %13 % %9 (chunk index within the row),
; %16 = %13 / %9 (row index), %17 = %15*256*128 + threadIdx.x (start element),
; %18 = row, %19 = reducer.bottom_value() (identity element, -inf for max).
%81 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %81) #9 | |
%82 = load i32, i32* %13, align 4 | |
%83 = load i32, i32* %9, align 4 | |
%84 = srem i32 %82, %83 | |
store i32 %84, i32* %15, align 4 | |
%85 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
%86 = load i32, i32* %13, align 4 | |
%87 = load i32, i32* %9, align 4 | |
%88 = sdiv i32 %86, %87 | |
store i32 %88, i32* %16, align 4 | |
%89 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %89) #9 | |
%90 = load i32, i32* %15, align 4 | |
%91 = mul nsw i32 %90, 256 | |
%92 = mul nsw i32 %91, 128 | |
%93 = load i32, i32* %12, align 4 | |
%94 = add nsw i32 %92, %93 | |
store i32 %94, i32* %17, align 4 | |
%95 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
%96 = load i32, i32* %16, align 4 | |
store i32 %96, i32* %18, align 4 | |
%97 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %97) #9 | |
%98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
store float %98, float* %19, align 4 | |
; skip the whole body when the row index is out of range (%18 >= %6)
%99 = load i32, i32* %18, align 4 | |
%100 = load i32, i32* %6, align 4 | |
%101 = icmp slt i32 %99, %100 | |
br i1 %101, label %102, label %195 | |
; <label>:102: ; preds = %80 | |
; --- accumulation loop: for (%20 = 0; %20 < 128; %20 += 16), each pass
; --- consuming a chunk of 16 stride-256 elements per thread.
%103 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %103) #9 | |
store i32 0, i32* %20, align 4 | |
br label %104 | |
; <label>:104: ; preds = %189, %102 | |
%105 = load i32, i32* %20, align 4 | |
%106 = icmp slt i32 %105, 128 | |
br i1 %106, label %108, label %107 | |
; <label>:107: ; preds = %104 | |
store i32 5, i32* %14, align 4 | |
br label %192 | |
; <label>:108: ; preds = %104 | |
; %21 = index of the LAST element of this 16-wide chunk; if it is past the
; row length (%7), take the bounds-checked tail path (%119), else the
; unchecked fast path (%156).
%109 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %109) #9 | |
%110 = load i32, i32* %17, align 4 | |
%111 = load i32, i32* %20, align 4 | |
%112 = add nsw i32 %111, 16 | |
%113 = sub nsw i32 %112, 1 | |
%114 = mul nsw i32 256, %113 | |
%115 = add nsw i32 %110, %114 | |
store i32 %115, i32* %21, align 4 | |
%116 = load i32, i32* %21, align 4 | |
%117 = load i32, i32* %7, align 4 | |
%118 = icmp sge i32 %116, %117 | |
br i1 %118, label %119, label %156 | |
; <label>:119: ; preds = %108 | |
; --- tail path: iterate the 16 sub-steps individually (%22 in [0,15)),
; --- breaking out as soon as an index crosses the row boundary.
; --- NOTE(review): the loop bound is 15, not 16 — the last sub-step is
; --- covered by the %21 >= %7 test that routed us here; verify vs. source.
%120 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %120) #9 | |
store i32 0, i32* %22, align 4 | |
br label %121 | |
; <label>:121: ; preds = %150, %119 | |
%122 = load i32, i32* %22, align 4 | |
%123 = icmp slt i32 %122, 15 | |
br i1 %123, label %125, label %124 | |
; <label>:124: ; preds = %121 | |
store i32 8, i32* %14, align 4 | |
br label %153 | |
; <label>:125: ; preds = %121 | |
%126 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %126) #9 | |
%127 = load i32, i32* %17, align 4 | |
%128 = load i32, i32* %20, align 4 | |
%129 = load i32, i32* %22, align 4 | |
%130 = add nsw i32 %128, %129 | |
%131 = mul nsw i32 256, %130 | |
%132 = add nsw i32 %127, %131 | |
store i32 %132, i32* %23, align 4 | |
%133 = load i32, i32* %23, align 4 | |
%134 = load i32, i32* %7, align 4 | |
%135 = icmp sge i32 %133, %134 | |
br i1 %135, label %136, label %137 | |
; <label>:136: ; preds = %125 | |
store i32 8, i32* %14, align 4 | |
br label %146 | |
; <label>:137: ; preds = %125 | |
; accumulate: %19 = reducer(%19, input.coeff(row * %7 + col))
%138 = load float, float* %19, align 4 | |
%139 = load i32, i32* %18, align 4 | |
%140 = load i32, i32* %7, align 4 | |
%141 = mul nsw i32 %139, %140 | |
%142 = load i32, i32* %23, align 4 | |
%143 = add nsw i32 %141, %142 | |
%144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %143) #10 | |
%145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10 | |
store float %145, float* %19, align 4 | |
store i32 0, i32* %14, align 4 | |
br label %146 | |
; <label>:146: ; preds = %137, %136 | |
%147 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %147) #9 | |
%148 = load i32, i32* %14, align 4 | |
switch i32 %148, label %153 [ | |
i32 0, label %149 | |
] | |
; <label>:149: ; preds = %146 | |
br label %150 | |
; <label>:150: ; preds = %149 | |
%151 = load i32, i32* %22, align 4 | |
%152 = add nsw i32 %151, 1 | |
store i32 %152, i32* %22, align 4 | |
br label %121, !llvm.loop !57 | |
; <label>:153: ; preds = %146, %124 | |
%154 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %154) #9 | |
br label %155 | |
; <label>:155: ; preds = %153 | |
; after the tail path, terminate the 128-wide accumulation loop (state 5)
store i32 5, i32* %14, align 4 | |
br label %185 | |
; <label>:156: ; preds = %108 | |
; --- fast path: all 16 sub-steps (%24 in [0,16)) are in bounds; no checks.
%157 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %157) #9 | |
store i32 0, i32* %24, align 4 | |
br label %158 | |
; <label>:158: ; preds = %180, %156 | |
%159 = load i32, i32* %24, align 4 | |
%160 = icmp slt i32 %159, 16 | |
br i1 %160, label %163, label %161 | |
; <label>:161: ; preds = %158 | |
store i32 11, i32* %14, align 4 | |
%162 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %162) #9 | |
br label %183 | |
; <label>:163: ; preds = %158 | |
%164 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %164) #9 | |
%165 = load i32, i32* %17, align 4 | |
%166 = load i32, i32* %20, align 4 | |
%167 = load i32, i32* %24, align 4 | |
%168 = add nsw i32 %166, %167 | |
%169 = mul nsw i32 256, %168 | |
%170 = add nsw i32 %165, %169 | |
store i32 %170, i32* %25, align 4 | |
; accumulate: %19 = reducer(%19, input.coeff(row * %7 + col))
%171 = load float, float* %19, align 4 | |
%172 = load i32, i32* %18, align 4 | |
%173 = load i32, i32* %7, align 4 | |
%174 = mul nsw i32 %172, %173 | |
%175 = load i32, i32* %25, align 4 | |
%176 = add nsw i32 %174, %175 | |
%177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %176) #10 | |
%178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10 | |
store float %178, float* %19, align 4 | |
%179 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %179) #9 | |
br label %180 | |
; <label>:180: ; preds = %163 | |
%181 = load i32, i32* %24, align 4 | |
%182 = add nsw i32 %181, 1 | |
store i32 %182, i32* %24, align 4 | |
br label %158, !llvm.loop !58 | |
; <label>:183: ; preds = %161 | |
br label %184 | |
; <label>:184: ; preds = %183 | |
store i32 0, i32* %14, align 4 | |
br label %185 | |
; <label>:185: ; preds = %184, %155 | |
%186 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %186) #9 | |
%187 = load i32, i32* %14, align 4 | |
switch i32 %187, label %192 [ | |
i32 0, label %188 | |
] | |
; <label>:188: ; preds = %185 | |
br label %189 | |
; <label>:189: ; preds = %188 | |
%190 = load i32, i32* %20, align 4 | |
%191 = add nsw i32 %190, 16 | |
store i32 %191, i32* %20, align 4 | |
br label %104, !llvm.loop !59 | |
; <label>:192: ; preds = %185, %107 | |
%193 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %193) #9 | |
br label %194 | |
; <label>:194: ; preds = %192 | |
br label %195 | |
; <label>:195: ; preds = %194, %80 | |
; --- warp-level reduction: for (%26 = 16; %26 > 0; %26 /= 2)
; ---   %19 = reducer(__shfl_down(%19, %26, 32), %19)
; --- pre-Volta mask-less shuffle; width 32 == full warp.
%196 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %196) #9 | |
store i32 16, i32* %26, align 4 | |
br label %197 | |
; <label>:197: ; preds = %208, %195 | |
%198 = load i32, i32* %26, align 4 | |
%199 = icmp sgt i32 %198, 0 | |
br i1 %199, label %202, label %200 | |
; <label>:200: ; preds = %197 | |
store i32 14, i32* %14, align 4 | |
%201 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %201) #9 | |
br label %211 | |
; <label>:202: ; preds = %197 | |
%203 = load float, float* %19, align 4 | |
%204 = load i32, i32* %26, align 4 | |
%205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10 | |
%206 = load float, float* %19, align 4 | |
%207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10 | |
store float %207, float* %19, align 4 | |
br label %208 | |
; <label>:208: ; preds = %202 | |
%209 = load i32, i32* %26, align 4 | |
%210 = sdiv i32 %209, 2 | |
store i32 %210, i32* %26, align 4 | |
br label %197, !llvm.loop !60 | |
; <label>:211: ; preds = %200 | |
; --- publish: lane 0 of each warp (%27 = threadIdx.x & 31 == 0), when the
; --- row is in range, folds its warp total into output[row] via the
; --- reducer's atomic CAS-max (atomic_reduce).
%212 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %212) #9 | |
%213 = load i32, i32* %12, align 4 | |
%214 = and i32 %213, 31 | |
store i32 %214, i32* %27, align 4 | |
%215 = load i32, i32* %27, align 4 | |
%216 = icmp eq i32 %215, 0 | |
br i1 %216, label %217, label %225 | |
; <label>:217: ; preds = %211 | |
%218 = load i32, i32* %18, align 4 | |
%219 = load i32, i32* %6, align 4 | |
%220 = icmp slt i32 %218, %219 | |
br i1 %220, label %221, label %225 | |
; <label>:221: ; preds = %217 | |
%222 = load i32, i32* %18, align 4 | |
%223 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %222) #10 | |
%224 = load float, float* %19, align 4 | |
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10 | |
br label %225 | |
; <label>:225: ; preds = %221, %217, %211 | |
; end-of-iteration lifetime cleanup, then advance by gridDim.x (32)
%226 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %226) #9 | |
%227 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %227) #9 | |
%228 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %228) #9 | |
%229 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %229) #9 | |
%230 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %230) #9 | |
%231 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %231) #9 | |
br label %232 | |
; <label>:232: ; preds = %225 | |
%233 = load i32, i32* %13, align 4 | |
%234 = add nsw i32 %233, 32 | |
store i32 %234, i32* %13, align 4 | |
br label %74 | |
; <label>:235: ; preds = %78 | |
; function epilogue: release remaining stack-slot lifetimes and return
%236 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %236) #9 | |
%237 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %237) #9 | |
%238 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %238) #9 | |
%239 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %239) #9 | |
%240 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %240) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
; CudaMaxReducer::bottom_value() — returns the reduction's identity element by
; loading the reducer's first (and, from this view, only accessed) float field
; at offset 0. Presumably initialized to -FLT_MAX / -inf by the host-side
; constructor — TODO(review): confirm; the constructor is not in this chunk.
define internal float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*) #0 align 2 { | |
%2 = alloca %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, align 8 | |
store %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %2, align 8 | |
%3 = load %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %2, align 8 | |
; address of field 0 of *this, then load and return it
%4 = getelementptr inbounds %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %3, i32 0, i32 0 | |
%5 = load float, float* %4, align 4 | |
ret float %5 | |
} | |
; Function Attrs: convergent nounwind | |
; CudaMaxReducer::operator()(float, float) const — the pairwise combine step of
; the max reduction: returns fmaxf(%1, %2). `this` (%7) is reloaded but unused;
; the reducer carries no state that this combine reads.
define internal float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, float, float) #0 align 2 { | |
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, align 8 | |
%5 = alloca float, align 4 | |
%6 = alloca float, align 4 | |
store %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8 | |
store float %1, float* %5, align 4 | |
store float %2, float* %6, align 4 | |
%7 = load %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8 | |
%8 = load float, float* %5, align 4 | |
%9 = load float, float* %6, align 4 | |
; delegate to the fmaxf wrapper (which forwards to libdevice __nv_fmaxf)
%10 = call float @_ZL4fmaxff(float %8, float %9) #10 | |
ret float %10 | |
} | |
; Function Attrs: convergent nounwind | |
; CudaMaxReducer::atomic_reduce(float* output, float value) — lock-free
; "atomic float max" via the classic atomicCAS loop: reinterpret the float
; bits as u32, and while the currently observed *output (as float) is still
; less than value, try to CAS value's bits in; on CAS failure adopt the value
; the CAS returned and retry. `this` (%10) is reloaded but unused.
; NOTE(review): the float compare at %19 means NaN/negative-zero handling
; follows fcmp olt semantics, not fmaxf's — matches the usual CUDA idiom.
define internal void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, float*, float) #0 align 2 { | |
; %7 = observed bits of *output, %8 = bits returned by the CAS,
; %9 = clang's loop-dispatch state (0 = continue, 3 = break)
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, align 8 | |
%5 = alloca float*, align 8 | |
%6 = alloca float, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32 | |
store %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8 | |
store float* %1, float** %5, align 8 | |
store float %2, float* %6, align 4 | |
%10 = load %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8 | |
%11 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %11) #9 | |
; initial (non-atomic) read of *output, type-punned to i32 bits
%12 = load float*, float** %5, align 8 | |
%13 = bitcast float* %12 to i32* | |
%14 = load i32, i32* %13, align 4 | |
store i32 %14, i32* %7, align 4 | |
br label %15 | |
; <label>:15: ; preds = %37, %3 | |
; loop guard: reinterpret observed bits as float; exit once observed >= value
%16 = bitcast i32* %7 to float* | |
%17 = load float, float* %16, align 4 | |
%18 = load float, float* %6, align 4 | |
%19 = fcmp olt float %17, %18 | |
br i1 %19, label %20, label %38 | |
; <label>:20: ; preds = %15 | |
; attempt: atomicCAS(output_bits, observed, value_bits)
%21 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %21) #9 | |
%22 = load float*, float** %5, align 8 | |
%23 = bitcast float* %22 to i32* | |
%24 = load i32, i32* %7, align 4 | |
%25 = bitcast float* %6 to i32* | |
%26 = load i32, i32* %25, align 4 | |
%27 = call i32 @_ZL9atomicCASPjjj(i32* %23, i32 %24, i32 %26) #10 | |
store i32 %27, i32* %8, align 4 | |
%28 = load i32, i32* %7, align 4 | |
%29 = load i32, i32* %8, align 4 | |
%30 = icmp eq i32 %28, %29 | |
br i1 %30, label %31, label %32 | |
; <label>:31: ; preds = %20 | |
; CAS succeeded (returned the expected value) — break out (state 3)
store i32 3, i32* %9, align 4 | |
br label %34 | |
; <label>:32: ; preds = %20 | |
; CAS lost the race — adopt the freshly returned bits and retry (state 0)
%33 = load i32, i32* %8, align 4 | |
store i32 %33, i32* %7, align 4 | |
store i32 0, i32* %9, align 4 | |
br label %34 | |
; <label>:34: ; preds = %32, %31 | |
%35 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %35) #9 | |
%36 = load i32, i32* %9, align 4 | |
switch i32 %36, label %40 [ | |
i32 0, label %37 | |
i32 3, label %38 | |
] | |
; <label>:37: ; preds = %34 | |
br label %15 | |
; <label>:38: ; preds = %34, %15 | |
%39 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %39) #9 | |
ret void | |
; <label>:40: ; preds = %34 | |
; dispatch default: %9 only ever holds 0 or 3, so this is dead
unreachable | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; fmax(float, float) — CUDA-header shim: the double-named fmax overload for
; floats simply forwards to fmaxf. alwaysinline; disappears after optimization.
define internal float @_ZL4fmaxff(float, float) #2 { | |
%3 = alloca float, align 4 | |
%4 = alloca float, align 4 | |
store float %0, float* %3, align 4 | |
store float %1, float* %4, align 4 | |
%5 = load float, float* %3, align 4 | |
%6 = load float, float* %4, align 4 | |
%7 = call float @_ZL5fmaxfff(float %5, float %6) #10 | |
ret float %7 | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; fmaxf(float, float) — forwards to the NVIDIA libdevice intrinsic
; @__nv_fmaxf, which supplies IEEE fmax semantics (NaN-ignoring) on device.
define internal float @_ZL5fmaxfff(float, float) #2 { | |
%3 = alloca float, align 4 | |
%4 = alloca float, align 4 | |
store float %0, float* %3, align 4 | |
store float %1, float* %4, align 4 | |
%5 = load float, float* %3, align 4 | |
%6 = load float, float* %4, align 4 | |
%7 = call float @__nv_fmaxf(float %5, float %6) #10 | |
ret float %7 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; atomicCAS(unsigned*, unsigned compare, unsigned val) — CUDA-header shim that
; forwards to __uAtomicCAS; returns the value that was at *%0 before the
; operation (the caller compares it against `compare` to detect success).
define internal i32 @_ZL9atomicCASPjjj(i32*, i32, i32) #4 { | |
%4 = alloca i32*, align 8 | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
store i32* %0, i32** %4, align 8 | |
store i32 %1, i32* %5, align 4 | |
store i32 %2, i32* %6, align 4 | |
%7 = load i32*, i32** %4, align 8 | |
%8 = load i32, i32* %5, align 4 | |
%9 = load i32, i32* %6, align 4 | |
%10 = call i32 @_ZL12__uAtomicCASPjjj(i32* %7, i32 %8, i32 %9) #10 | |
ret i32 %10 | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; __uAtomicCAS(unsigned*, unsigned, unsigned) — lowers CUDA's atomicCAS to the
; LLVM `cmpxchg` instruction and returns the loaded (pre-operation) value from
; the result pair; the success flag (field 1) is discarded, matching CUDA's
; return-old-value contract.
; NOTE(review): emitted with seq_cst/seq_cst ordering here, stronger than the
; relaxed device semantics CUDA atomics guarantee — conservative, not a bug.
define internal i32 @_ZL12__uAtomicCASPjjj(i32*, i32, i32) #2 { | |
%4 = alloca i32*, align 8 | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
store i32* %0, i32** %4, align 8 | |
store i32 %1, i32* %5, align 4 | |
store i32 %2, i32* %6, align 4 | |
%7 = load i32*, i32** %4, align 8 | |
%8 = load i32, i32* %5, align 4 | |
%9 = load i32, i32* %6, align 4 | |
%10 = cmpxchg i32* %7, i32 %8, i32 %9 seq_cst seq_cst | |
%11 = extractvalue { i32, i1 } %10, 0 | |
ret i32 %11 | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32, align 4 | |
%15 = alloca float, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca float, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%19 = icmp eq i32 %18, 256 | |
br i1 %19, label %20, label %21 | |
; <label>:20: ; preds = %5 | |
br label %22 | |
; <label>:21: ; preds = %5 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %22 | |
; <label>:22: ; preds = %21, %20 | |
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
%24 = icmp eq i32 %23, 1 | |
br i1 %24, label %25, label %26 | |
; <label>:25: ; preds = %22 | |
br label %27 | |
; <label>:26: ; preds = %22 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %27 | |
; <label>:27: ; preds = %26, %25 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
%29 = icmp eq i32 %28, 1 | |
br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %27 | |
br label %32 | |
; <label>:31: ; preds = %27 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %32 | |
; <label>:32: ; preds = %31, %30 | |
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%34 = icmp eq i32 %33, 128 | |
br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
br label %37 | |
; <label>:36: ; preds = %32 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %37 | |
; <label>:37: ; preds = %36, %35 | |
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
%39 = icmp eq i32 %38, 1 | |
br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
br label %42 | |
; <label>:41: ; preds = %37 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %42 | |
; <label>:42: ; preds = %41, %40 | |
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
%44 = icmp eq i32 %43, 1 | |
br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
br label %47 | |
; <label>:46: ; preds = %42 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %47 | |
; <label>:47: ; preds = %46, %45 | |
%48 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %48) #9 | |
%49 = load i32, i32* %6, align 4 | |
%50 = add nsw i32 %49, 16 | |
%51 = sub nsw i32 %50, 1 | |
%52 = sdiv i32 %51, 16 | |
%53 = load i32, i32* %7, align 4 | |
%54 = mul nsw i32 %52, %53 | |
store i32 %54, i32* %8, align 4 | |
%55 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %56, i32* %9, align 4 | |
%57 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %57) #9 | |
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %58, i32* %10, align 4 | |
%59 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
%60 = load i32, i32* %9, align 4 | |
%61 = mul nsw i32 %60, 256 | |
%62 = load i32, i32* %10, align 4 | |
%63 = add nsw i32 %61, %62 | |
store i32 %63, i32* %11, align 4 | |
br label %64 | |
; <label>:64: ; preds = %135, %47 | |
%65 = load i32, i32* %11, align 4 | |
%66 = load i32, i32* %8, align 4 | |
%67 = icmp slt i32 %65, %66 | |
br i1 %67, label %70, label %68 | |
; <label>:68: ; preds = %64 | |
store i32 2, i32* %12, align 4 | |
%69 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %69) #9 | |
br label %138 | |
; <label>:70: ; preds = %64 | |
%71 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
%72 = load i32, i32* %11, align 4 | |
%73 = load i32, i32* %7, align 4 | |
%74 = srem i32 %72, %73 | |
store i32 %74, i32* %13, align 4 | |
%75 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %75) #9 | |
%76 = load i32, i32* %11, align 4 | |
%77 = load i32, i32* %7, align 4 | |
%78 = sdiv i32 %76, %77 | |
%79 = load i32, i32* %6, align 4 | |
%80 = add nsw i32 %79, 16 | |
%81 = sub nsw i32 %80, 1 | |
%82 = sdiv i32 %81, 16 | |
%83 = srem i32 %78, %82 | |
%84 = mul nsw i32 %83, 16 | |
store i32 %84, i32* %14, align 4 | |
%85 = bitcast float* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
%86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10 | |
store float %87, float* %15, align 4 | |
%88 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %88) #9 | |
store i32 0, i32* %16, align 4 | |
br label %89 | |
; <label>:89: ; preds = %124, %70 | |
%90 = load i32, i32* %16, align 4 | |
%91 = icmp slt i32 %90, 16 | |
br i1 %91, label %94, label %92 | |
; <label>:92: ; preds = %89 | |
store i32 5, i32* %12, align 4 | |
%93 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %93) #9 | |
br label %127 | |
; <label>:94: ; preds = %89 | |
%95 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
%96 = load i32, i32* %13, align 4 | |
%97 = load i32, i32* %7, align 4 | |
%98 = icmp slt i32 %96, %97 | |
br i1 %98, label %99, label %114 | |
; <label>:99: ; preds = %94 | |
%100 = load i32, i32* %14, align 4 | |
%101 = load i32, i32* %16, align 4 | |
%102 = add nsw i32 %100, %101 | |
%103 = load i32, i32* %6, align 4 | |
%104 = icmp slt i32 %102, %103 | |
br i1 %104, label %105, label %114 | |
; <label>:105: ; preds = %99 | |
%106 = load i32, i32* %14, align 4 | |
%107 = load i32, i32* %16, align 4 | |
%108 = add nsw i32 %106, %107 | |
%109 = load i32, i32* %7, align 4 | |
%110 = mul nsw i32 %108, %109 | |
%111 = load i32, i32* %13, align 4 | |
%112 = add nsw i32 %110, %111 | |
%113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %112) #10 | |
br label %117 | |
; <label>:114: ; preds = %99, %94 | |
%115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10 | |
br label %117 | |
; <label>:117: ; preds = %114, %105 | |
%118 = phi float [ %113, %105 ], [ %116, %114 ] | |
store float %118, float* %17, align 4 | |
%119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%120 = load float, float* %15, align 4 | |
%121 = load float, float* %17, align 4 | |
%122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10 | |
store float %122, float* %15, align 4 | |
%123 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %123) #9 | |
br label %124 | |
; <label>:124: ; preds = %117 | |
%125 = load i32, i32* %16, align 4 | |
%126 = add nsw i32 %125, 1 | |
store i32 %126, i32* %16, align 4 | |
br label %89 | |
; <label>:127: ; preds = %92 | |
%128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%129 = load i32, i32* %13, align 4 | |
%130 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %129) #10 | |
%131 = load float, float* %15, align 4 | |
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10 | |
%132 = bitcast float* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %132) #9 | |
%133 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %133) #9 | |
%134 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %134) #9 | |
br label %135 | |
; <label>:135: ; preds = %127 | |
%136 = load i32, i32* %11, align 4 | |
%137 = add nsw i32 %136, 32768 | |
store i32 %137, i32* %11, align 4 | |
br label %64 | |
; <label>:138: ; preds = %68 | |
%139 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %139) #9 | |
%140 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %140) #9 | |
%141 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %141) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32, align 4 | |
%15 = alloca float, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca float, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%19 = icmp eq i32 %18, 256 | |
br i1 %19, label %20, label %21 | |
; <label>:20: ; preds = %5 | |
br label %22 | |
; <label>:21: ; preds = %5 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %22 | |
; <label>:22: ; preds = %21, %20 | |
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
%24 = icmp eq i32 %23, 1 | |
br i1 %24, label %25, label %26 | |
; <label>:25: ; preds = %22 | |
br label %27 | |
; <label>:26: ; preds = %22 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %27 | |
; <label>:27: ; preds = %26, %25 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
%29 = icmp eq i32 %28, 1 | |
br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %27 | |
br label %32 | |
; <label>:31: ; preds = %27 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %32 | |
; <label>:32: ; preds = %31, %30 | |
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%34 = icmp eq i32 %33, 128 | |
br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
br label %37 | |
; <label>:36: ; preds = %32 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %37 | |
; <label>:37: ; preds = %36, %35 | |
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
%39 = icmp eq i32 %38, 1 | |
br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
br label %42 | |
; <label>:41: ; preds = %37 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %42 | |
; <label>:42: ; preds = %41, %40 | |
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
%44 = icmp eq i32 %43, 1 | |
br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
br label %47 | |
; <label>:46: ; preds = %42 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %47 | |
; <label>:47: ; preds = %46, %45 | |
%48 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %48) #9 | |
%49 = load i32, i32* %6, align 4 | |
%50 = add nsw i32 %49, 16 | |
%51 = sub nsw i32 %50, 1 | |
%52 = sdiv i32 %51, 16 | |
%53 = load i32, i32* %7, align 4 | |
%54 = mul nsw i32 %52, %53 | |
store i32 %54, i32* %8, align 4 | |
%55 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %56, i32* %9, align 4 | |
%57 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %57) #9 | |
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %58, i32* %10, align 4 | |
%59 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
%60 = load i32, i32* %9, align 4 | |
%61 = mul nsw i32 %60, 256 | |
%62 = load i32, i32* %10, align 4 | |
%63 = add nsw i32 %61, %62 | |
store i32 %63, i32* %11, align 4 | |
br label %64 | |
; <label>:64: ; preds = %131, %47 | |
%65 = load i32, i32* %11, align 4 | |
%66 = load i32, i32* %8, align 4 | |
%67 = icmp slt i32 %65, %66 | |
br i1 %67, label %70, label %68 | |
; <label>:68: ; preds = %64 | |
store i32 2, i32* %12, align 4 | |
%69 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %69) #9 | |
br label %134 | |
; <label>:70: ; preds = %64 | |
%71 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
%72 = load i32, i32* %11, align 4 | |
%73 = load i32, i32* %7, align 4 | |
%74 = srem i32 %72, %73 | |
store i32 %74, i32* %13, align 4 | |
%75 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %75) #9 | |
%76 = load i32, i32* %11, align 4 | |
%77 = load i32, i32* %7, align 4 | |
%78 = sdiv i32 %76, %77 | |
%79 = load i32, i32* %6, align 4 | |
%80 = add nsw i32 %79, 16 | |
%81 = sub nsw i32 %80, 1 | |
%82 = sdiv i32 %81, 16 | |
%83 = srem i32 %78, %82 | |
%84 = mul nsw i32 %83, 16 | |
store i32 %84, i32* %14, align 4 | |
%85 = bitcast float* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
%86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
store float %86, float* %15, align 4 | |
%87 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %87) #9 | |
store i32 0, i32* %16, align 4 | |
br label %88 | |
; <label>:88: ; preds = %121, %70 | |
%89 = load i32, i32* %16, align 4 | |
%90 = icmp slt i32 %89, 16 | |
br i1 %90, label %93, label %91 | |
; <label>:91: ; preds = %88 | |
store i32 5, i32* %12, align 4 | |
%92 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %92) #9 | |
br label %124 | |
; <label>:93: ; preds = %88 | |
%94 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %94) #9 | |
%95 = load i32, i32* %13, align 4 | |
%96 = load i32, i32* %7, align 4 | |
%97 = icmp slt i32 %95, %96 | |
br i1 %97, label %98, label %113 | |
; <label>:98: ; preds = %93 | |
%99 = load i32, i32* %14, align 4 | |
%100 = load i32, i32* %16, align 4 | |
%101 = add nsw i32 %99, %100 | |
%102 = load i32, i32* %6, align 4 | |
%103 = icmp slt i32 %101, %102 | |
br i1 %103, label %104, label %113 | |
; <label>:104: ; preds = %98 | |
%105 = load i32, i32* %14, align 4 | |
%106 = load i32, i32* %16, align 4 | |
%107 = add nsw i32 %105, %106 | |
%108 = load i32, i32* %7, align 4 | |
%109 = mul nsw i32 %107, %108 | |
%110 = load i32, i32* %13, align 4 | |
%111 = add nsw i32 %109, %110 | |
%112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %111) #10 | |
br label %115 | |
; <label>:113: ; preds = %98, %93 | |
%114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
br label %115 | |
; <label>:115: ; preds = %113, %104 | |
%116 = phi float [ %112, %104 ], [ %114, %113 ] | |
store float %116, float* %17, align 4 | |
%117 = load float, float* %15, align 4 | |
%118 = load float, float* %17, align 4 | |
%119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10 | |
store float %119, float* %15, align 4 | |
%120 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %120) #9 | |
br label %121 | |
; <label>:121: ; preds = %115 | |
%122 = load i32, i32* %16, align 4 | |
%123 = add nsw i32 %122, 1 | |
store i32 %123, i32* %16, align 4 | |
br label %88 | |
; <label>:124: ; preds = %91 | |
%125 = load i32, i32* %13, align 4 | |
%126 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %125) #10 | |
%127 = load float, float* %15, align 4 | |
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10 | |
%128 = bitcast float* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %128) #9 | |
%129 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %129) #9 | |
%130 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %130) #9 | |
br label %131 | |
; <label>:131: ; preds = %124 | |
%132 = load i32, i32* %11, align 4 | |
%133 = add nsw i32 %132, 32768 | |
store i32 %133, i32* %11, align 4 | |
br label %64 | |
; <label>:134: ; preds = %68 | |
%135 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %135) #9 | |
%136 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %136) #9 | |
%137 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %137) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.6"* byval align 8, i32) #0 comdat { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca %"struct.Eigen::TensorEvaluator.6", align 8 | |
%7 = alloca i8, align 1 | |
%8 = alloca %"struct.Eigen::TensorEvaluator.6", align 8 | |
store i32 %1, i32* %3, align 4 | |
%9 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%12 = mul i32 %10, %11 | |
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%14 = add i32 %12, %13 | |
store i32 %14, i32* %4, align 4 | |
%15 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %15) #9 | |
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%18 = mul i32 %16, %17 | |
store i32 %18, i32* %5, align 4 | |
%19 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8* | |
call void @llvm.lifetime.start(i64 168, i8* %19) #9 | |
%20 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8* | |
%21 = bitcast %"struct.Eigen::TensorEvaluator.6"* %0 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 168, i32 8, i1 false) | |
call void @llvm.lifetime.start(i64 1, i8* %7) #9 | |
store i8 0, i8* %7, align 1 | |
%22 = bitcast %"struct.Eigen::TensorEvaluator.6"* %8 to i8* | |
%23 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 168, i32 8, i1 false) | |
%24 = load i32, i32* %4, align 4 | |
%25 = load i32, i32* %3, align 4 | |
%26 = load i32, i32* %5, align 4 | |
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.6"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10 | |
call void @llvm.lifetime.end(i64 1, i8* %7) #9 | |
%27 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8* | |
call void @llvm.lifetime.end(i64 168, i8* %27) #9 | |
%28 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %28) #9 | |
%29 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %29) #9 | |
ret void | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.6"* byval align 8, i32, i32, i32) #2 comdat align 2 { | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
store i32 %1, i32* %5, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%9 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
%10 = load i32, i32* %5, align 4 | |
store i32 %10, i32* %8, align 4 | |
br label %11 | |
; <label>:11: ; preds = %19, %4 | |
%12 = load i32, i32* %8, align 4 | |
%13 = load i32, i32* %6, align 4 | |
%14 = icmp slt i32 %12, %13 | |
br i1 %14, label %17, label %15 | |
; <label>:15: ; preds = %11 | |
%16 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
br label %23 | |
; <label>:17: ; preds = %11 | |
%18 = load i32, i32* %8, align 4 | |
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.6"* %0, i32 %18) #10 | |
br label %19 | |
; <label>:19: ; preds = %17 | |
%20 = load i32, i32* %7, align 4 | |
%21 = load i32, i32* %8, align 4 | |
%22 = add nsw i32 %21, %20 | |
store i32 %22, i32* %8, align 4 | |
br label %11 | |
; <label>:23: ; preds = %15 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.6"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator.6"*, align 8 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator.6"* %0, %"struct.Eigen::TensorEvaluator.6"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%5 = load %"struct.Eigen::TensorEvaluator.6"*, %"struct.Eigen::TensorEvaluator.6"** %3, align 8 | |
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %5, i32 0, i32 1 | |
%7 = load i32, i32* %4, align 4 | |
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.8"* %6, i32 %7) #10 | |
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %5, i32 0, i32 0 | |
%10 = load i32, i32* %4, align 4 | |
%11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %9, i32 %10) #10 | |
store float %8, float* %11, align 4 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.8"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator.8"*, align 8 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator.8"* %0, %"struct.Eigen::TensorEvaluator.8"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%5 = load %"struct.Eigen::TensorEvaluator.8"*, %"struct.Eigen::TensorEvaluator.8"** %3, align 8 | |
%6 = load i32, i32* %4, align 4 | |
%7 = sext i32 %6 to i64 | |
%8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %5, i32 0, i32 3 | |
%9 = load float*, float** %8, align 8 | |
%10 = getelementptr inbounds float, float* %9, i64 %7 | |
%11 = load float, float* %10, align 4 | |
ret float %11 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
define linkonce_odr dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator.7"*, align 8 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator.7"* %0, %"struct.Eigen::TensorEvaluator.7"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%5 = load %"struct.Eigen::TensorEvaluator.7"*, %"struct.Eigen::TensorEvaluator.7"** %3, align 8 | |
%6 = load i32, i32* %4, align 4 | |
%7 = sext i32 %6 to i64 | |
%8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %5, i32 0, i32 0 | |
%9 = load float*, float** %8, align 8 | |
%10 = getelementptr inbounds float, float* %9, i64 %7 | |
ret float* %10 | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.11"* byval align 8, i32) #0 comdat { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca %"struct.Eigen::TensorEvaluator.11", align 8 | |
%7 = alloca i8, align 1 | |
%8 = alloca %"struct.Eigen::TensorEvaluator.11", align 8 | |
store i32 %1, i32* %3, align 4 | |
%9 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%12 = mul i32 %10, %11 | |
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%14 = add i32 %12, %13 | |
store i32 %14, i32* %4, align 4 | |
%15 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %15) #9 | |
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%18 = mul i32 %16, %17 | |
store i32 %18, i32* %5, align 4 | |
%19 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8* | |
call void @llvm.lifetime.start(i64 136, i8* %19) #9 | |
%20 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8* | |
%21 = bitcast %"struct.Eigen::TensorEvaluator.11"* %0 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 136, i32 8, i1 false) | |
call void @llvm.lifetime.start(i64 1, i8* %7) #9 | |
store i8 0, i8* %7, align 1 | |
%22 = bitcast %"struct.Eigen::TensorEvaluator.11"* %8 to i8* | |
%23 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 136, i32 8, i1 false) | |
%24 = load i32, i32* %4, align 4 | |
%25 = load i32, i32* %3, align 4 | |
%26 = load i32, i32* %5, align 4 | |
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.11"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10 | |
call void @llvm.lifetime.end(i64 1, i8* %7) #9 | |
%27 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8* | |
call void @llvm.lifetime.end(i64 136, i8* %27) #9 | |
%28 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %28) #9 | |
%29 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %29) #9 | |
ret void | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.11"* byval align 8, i32, i32, i32) #2 comdat align 2 { | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
store i32 %1, i32* %5, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%9 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
%10 = load i32, i32* %5, align 4 | |
store i32 %10, i32* %8, align 4 | |
br label %11 | |
; <label>:11: ; preds = %19, %4 | |
%12 = load i32, i32* %8, align 4 | |
%13 = load i32, i32* %6, align 4 | |
%14 = icmp slt i32 %12, %13 | |
br i1 %14, label %17, label %15 | |
; <label>:15: ; preds = %11 | |
%16 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
br label %23 | |
; <label>:17: ; preds = %11 | |
%18 = load i32, i32* %8, align 4 | |
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.11"* %0, i32 %18) #10 | |
br label %19 | |
; <label>:19: ; preds = %17 | |
%20 = load i32, i32* %7, align 4 | |
%21 = load i32, i32* %8, align 4 | |
%22 = add nsw i32 %21, %20 | |
store i32 %22, i32* %8, align 4 | |
br label %11 | |
; <label>:23: ; preds = %15 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.11"*, i32) #4 comdat align 2 { | |
%3 = alloca %"struct.Eigen::TensorEvaluator.11"*, align 8 | |
%4 = alloca i32, align 4 | |
store %"struct.Eigen::TensorEvaluator.11"* %0, %"struct.Eigen::TensorEvaluator.11"** %3, align 8 | |
store i32 %1, i32* %4, align 4 | |
%5 = load %"struct.Eigen::TensorEvaluator.11"*, %"struct.Eigen::TensorEvaluator.11"** %3, align 8 | |
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %5, i32 0, i32 1 | |
%7 = load i32, i32* %4, align 4 | |
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator"* %6, i32 %7) #10 | |
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %5, i32 0, i32 0 | |
%10 = load i32, i32* %4, align 4 | |
%11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %9, i32 %10) #10 | |
store float %8, float* %11, align 4 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
; Device kernel Eigen::internal::(anonymous namespace)::InitVector<
;   TensorEvaluator<TensorMap<Tensor<float,1,1,int>,0>, GpuDevice>>(float, int, eval)
; Grid-stride fill: stores the scalar argument %0 into output.coeffRef(i) for
; every i in [thread_id, %1) with stride gridDim.x*blockDim.x.
; -O0-style IR: each argument/local lives in a stack slot (alloca) and is
; reloaded at every use.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat { | |
; stack slots: %4 = fill value, %5 = element count, %6 = this thread's first
; index, %7 = loop index i
%4 = alloca float, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
store float %0, float* %4, align 4 | |
store i32 %1, i32* %5, align 4 | |
%8 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %8) #9 | |
; thread_id = blockIdx.x * blockDim.x + threadIdx.x
%9 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%10 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%11 = mul i32 %9, %10 | |
%12 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%13 = add i32 %11, %12 | |
store i32 %13, i32* %6, align 4 | |
%14 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %14) #9 | |
%15 = load i32, i32* %6, align 4 | |
store i32 %15, i32* %7, align 4 | |
br label %16 | |
; loop header: continue while i < element count
; <label>:16: ; preds = %26, %3 | |
%17 = load i32, i32* %7, align 4 | |
%18 = load i32, i32* %5, align 4 | |
%19 = icmp slt i32 %17, %18 | |
br i1 %19, label %22, label %20 | |
; loop exit: end lifetime of i and fall through to function epilogue
; <label>:20: ; preds = %16 | |
%21 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %21) #9 | |
br label %32 | |
; loop body: output.coeffRef(i) = value
; <label>:22: ; preds = %16 | |
%23 = load float, float* %4, align 4 | |
%24 = load i32, i32* %7, align 4 | |
%25 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %2, i32 %24) #10 | |
store float %23, float* %25, align 4 | |
br label %26 | |
; loop latch: i += gridDim.x * blockDim.x (total threads in the grid)
; <label>:26: ; preds = %22 | |
%27 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = mul i32 %27, %28 | |
%30 = load i32, i32* %7, align 4 | |
%31 = add i32 %30, %29 | |
store i32 %31, i32* %7, align 4 | |
br label %16 | |
; <label>:32: ; preds = %20 | |
%33 = bitcast i32* %6 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %33) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
; Device kernel Eigen::internal::(anonymous namespace)::RowReduceKernel<
;   NumBlocks=32, BlockSize=256, NumPerThread=128,
;   Input  = TensorEvaluator<const TensorMap<Tensor<float,2,1,int>,0>, GpuDevice>,
;   Output = TensorEvaluator<TensorMap<Tensor<float,1,1,int>,0>, GpuDevice>,
;   Reducer = CudaSumReducer>
; Arguments: %0 = sum reducer (byval), %1 = 2-D input evaluator (byval),
; %2 / %3 = two extents, %4 = 1-D output evaluator (byval).
; From the indexing below: input coefficient (r, c) is read at linear index
; r*%3 + c and the result for r is folded into output.coeffRef(r), so %3 is
; the length of the reduced dimension ("columns") and %2 the number of
; preserved rows.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat { | |
; -O0-style IR: all locals live in stack slots.  %14 is a "cleanup
; destination" slot used to route break/continue control flow through the
; lifetime-end blocks of the nested loops (values 0/2/5/8/11/14 select the
; resume target in the switch instructions below).
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca i32, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca float, align 4 | |
%20 = alloca i32, align 4 | |
%21 = alloca i32, align 4 | |
%22 = alloca i32, align 4 | |
%23 = alloca i32, align 4 | |
%24 = alloca i32, align 4 | |
%25 = alloca i32, align 4 | |
%26 = alloca i32, align 4 | |
%27 = alloca i32, align 4 | |
; %6 = preserved row count (arg %2), %7 = reduced row length (arg %3)
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
; Launch-configuration asserts: the kernel requires blockDim == (256,1,1)
; and gridDim == (32,1,1); each failing check calls __assert_fail.
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = icmp eq i32 %28, 256 | |
br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %5 | |
br label %32 | |
; <label>:31: ; preds = %5 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %32 | |
; <label>:32: ; preds = %31, %30 | |
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
%34 = icmp eq i32 %33, 1 | |
br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
br label %37 | |
; <label>:36: ; preds = %32 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %37 | |
; <label>:37: ; preds = %36, %35 | |
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
%39 = icmp eq i32 %38, 1 | |
br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
br label %42 | |
; <label>:41: ; preds = %37 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %42 | |
; <label>:42: ; preds = %41, %40 | |
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%44 = icmp eq i32 %43, 32 | |
br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
br label %47 | |
; <label>:46: ; preds = %42 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %47 | |
; <label>:47: ; preds = %46, %45 | |
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
%49 = icmp eq i32 %48, 1 | |
br i1 %49, label %50, label %51 | |
; <label>:50: ; preds = %47 | |
br label %52 | |
; <label>:51: ; preds = %47 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %52 | |
; <label>:52: ; preds = %51, %50 | |
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
%54 = icmp eq i32 %53, 1 | |
br i1 %54, label %55, label %56 | |
; <label>:55: ; preds = %52 | |
br label %57 | |
; <label>:56: ; preds = %52 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %57 | |
; Work partitioning.  Each row is split into chunks of 256*128 = 32768
; columns; %9 = ceil(row_len / 32768) chunks per row, %10 = chunks * rows =
; total work items.  Work item i is claimed by block (i mod 32) and walked
; with stride 32 (gridDim.x, asserted above).
; <label>:57: ; preds = %56, %55 | |
%58 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %58) #9 | |
; %8 holds the constant 16 (the unroll granularity); it is stored here but
; never re-loaded in this function — the literal 16 is used directly below.
store i32 16, i32* %8, align 4 | |
%59 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
%60 = load i32, i32* %7, align 4 | |
%61 = add nsw i32 %60, 32768 | |
%62 = sub nsw i32 %61, 1 | |
%63 = sdiv i32 %62, 32768 | |
store i32 %63, i32* %9, align 4 | |
%64 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %64) #9 | |
%65 = load i32, i32* %9, align 4 | |
%66 = load i32, i32* %6, align 4 | |
%67 = mul nsw i32 %65, %66 | |
store i32 %67, i32* %10, align 4 | |
%68 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %68) #9 | |
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %69, i32* %11, align 4 | |
%70 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %70) #9 | |
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %71, i32* %12, align 4 | |
%72 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %72) #9 | |
; %13 = outer loop index i, starting at blockIdx.x, stepped by 32 in the
; latch at label 237.
%73 = load i32, i32* %11, align 4 | |
store i32 %73, i32* %13, align 4 | |
br label %74 | |
; outer loop header: continue while i < total work items (%10)
; <label>:74: ; preds = %237, %57 | |
%75 = load i32, i32* %13, align 4 | |
%76 = load i32, i32* %10, align 4 | |
%77 = icmp slt i32 %75, %76 | |
br i1 %77, label %80, label %78 | |
; outer loop exit
; <label>:78: ; preds = %74 | |
store i32 2, i32* %14, align 4 | |
%79 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %79) #9 | |
br label %240 | |
; outer loop body: decode work item i into (column chunk, row)
; %15 = i mod chunks (which chunk of columns), %16 = i / chunks (which row)
; <label>:80: ; preds = %74 | |
%81 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %81) #9 | |
%82 = load i32, i32* %13, align 4 | |
%83 = load i32, i32* %9, align 4 | |
%84 = srem i32 %82, %83 | |
store i32 %84, i32* %15, align 4 | |
%85 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
%86 = load i32, i32* %13, align 4 | |
%87 = load i32, i32* %9, align 4 | |
%88 = sdiv i32 %86, %87 | |
store i32 %88, i32* %16, align 4 | |
; %17 = this thread's first column: chunk*256*128 + threadIdx.x
%89 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %89) #9 | |
%90 = load i32, i32* %15, align 4 | |
%91 = mul nsw i32 %90, 256 | |
%92 = mul nsw i32 %91, 128 | |
%93 = load i32, i32* %12, align 4 | |
%94 = add nsw i32 %92, %93 | |
store i32 %94, i32* %17, align 4 | |
%95 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
%96 = load i32, i32* %16, align 4 | |
store i32 %96, i32* %18, align 4 | |
; %19 = per-thread accumulator, seeded with the reducer's bottom value
; (CudaSumReducer is laid over FnSumReducer<Identity> via the bitcast).
%97 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %97) #9 | |
%98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10 | |
store float %99, float* %19, align 4 | |
; only accumulate if the decoded row is in range
%100 = load i32, i32* %18, align 4 | |
%101 = load i32, i32* %6, align 4 | |
%102 = icmp slt i32 %100, %101 | |
br i1 %102, label %103, label %198 | |
; accumulation: %20 = j, stepping 0,16,32,... < 128 (groups of 16 strided
; elements per thread; consecutive elements are 256 columns apart)
; <label>:103: ; preds = %80 | |
%104 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %104) #9 | |
store i32 0, i32* %20, align 4 | |
br label %105 | |
; <label>:105: ; preds = %192, %103 | |
%106 = load i32, i32* %20, align 4 | |
%107 = icmp slt i32 %106, 128 | |
br i1 %107, label %109, label %108 | |
; <label>:108: ; preds = %105 | |
store i32 5, i32* %14, align 4 | |
br label %195 | |
; %21 = column of the last (16th) element in this group:
; first_col + 256*(j+16-1).  If it would run past the row length, take the
; bounds-checked tail loop; otherwise the unchecked 16-wide loop.
; <label>:109: ; preds = %105 | |
%110 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %110) #9 | |
%111 = load i32, i32* %17, align 4 | |
%112 = load i32, i32* %20, align 4 | |
%113 = add nsw i32 %112, 16 | |
%114 = sub nsw i32 %113, 1 | |
%115 = mul nsw i32 256, %114 | |
%116 = add nsw i32 %111, %115 | |
store i32 %116, i32* %21, align 4 | |
%117 = load i32, i32* %21, align 4 | |
%118 = load i32, i32* %7, align 4 | |
%119 = icmp sge i32 %117, %118 | |
br i1 %119, label %120, label %158 | |
; tail path: %22 = k in [0,15), stop early once the column goes out of range
; <label>:120: ; preds = %109 | |
%121 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %121) #9 | |
store i32 0, i32* %22, align 4 | |
br label %122 | |
; <label>:122: ; preds = %152, %120 | |
%123 = load i32, i32* %22, align 4 | |
%124 = icmp slt i32 %123, 15 | |
br i1 %124, label %126, label %125 | |
; <label>:125: ; preds = %122 | |
store i32 8, i32* %14, align 4 | |
br label %155 | |
; <label>:126: ; preds = %122 | |
%127 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %127) #9 | |
; %23 = column = first_col + 256*(j+k)
%128 = load i32, i32* %17, align 4 | |
%129 = load i32, i32* %20, align 4 | |
%130 = load i32, i32* %22, align 4 | |
%131 = add nsw i32 %129, %130 | |
%132 = mul nsw i32 256, %131 | |
%133 = add nsw i32 %128, %132 | |
store i32 %133, i32* %23, align 4 | |
%134 = load i32, i32* %23, align 4 | |
%135 = load i32, i32* %7, align 4 | |
%136 = icmp sge i32 %134, %135 | |
br i1 %136, label %137, label %138 | |
; column out of range -> break out of the tail loop (cleanup dest 8)
; <label>:137: ; preds = %126 | |
store i32 8, i32* %14, align 4 | |
br label %148 | |
; acc = reducer(acc, input.coeff(row*row_len + col))
; <label>:138: ; preds = %126 | |
%139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%140 = load float, float* %19, align 4 | |
%141 = load i32, i32* %18, align 4 | |
%142 = load i32, i32* %7, align 4 | |
%143 = mul nsw i32 %141, %142 | |
%144 = load i32, i32* %23, align 4 | |
%145 = add nsw i32 %143, %144 | |
%146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %145) #10 | |
%147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10 | |
store float %147, float* %19, align 4 | |
store i32 0, i32* %14, align 4 | |
br label %148 | |
; <label>:148: ; preds = %138, %137 | |
%149 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %149) #9 | |
%150 = load i32, i32* %14, align 4 | |
switch i32 %150, label %155 [ | |
i32 0, label %151 | |
] | |
; <label>:151: ; preds = %148 | |
br label %152 | |
; tail-loop latch: ++k
; <label>:152: ; preds = %151 | |
%153 = load i32, i32* %22, align 4 | |
%154 = add nsw i32 %153, 1 | |
store i32 %154, i32* %22, align 4 | |
br label %122, !llvm.loop !61 | |
; <label>:155: ; preds = %148, %125 | |
%156 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %156) #9 | |
br label %157 | |
; <label>:157: ; preds = %155 | |
store i32 5, i32* %14, align 4 | |
br label %188 | |
; fast path: whole group in range; %24 = k in [0,16), no bounds checks
; <label>:158: ; preds = %109 | |
%159 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %159) #9 | |
store i32 0, i32* %24, align 4 | |
br label %160 | |
; <label>:160: ; preds = %183, %158 | |
%161 = load i32, i32* %24, align 4 | |
%162 = icmp slt i32 %161, 16 | |
br i1 %162, label %165, label %163 | |
; <label>:163: ; preds = %160 | |
store i32 11, i32* %14, align 4 | |
%164 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %164) #9 | |
br label %186 | |
; acc = reducer(acc, input.coeff(row*row_len + first_col + 256*(j+k)))
; <label>:165: ; preds = %160 | |
%166 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %166) #9 | |
%167 = load i32, i32* %17, align 4 | |
%168 = load i32, i32* %20, align 4 | |
%169 = load i32, i32* %24, align 4 | |
%170 = add nsw i32 %168, %169 | |
%171 = mul nsw i32 256, %170 | |
%172 = add nsw i32 %167, %171 | |
store i32 %172, i32* %25, align 4 | |
%173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%174 = load float, float* %19, align 4 | |
%175 = load i32, i32* %18, align 4 | |
%176 = load i32, i32* %7, align 4 | |
%177 = mul nsw i32 %175, %176 | |
%178 = load i32, i32* %25, align 4 | |
%179 = add nsw i32 %177, %178 | |
%180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %179) #10 | |
%181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10 | |
store float %181, float* %19, align 4 | |
%182 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %182) #9 | |
br label %183 | |
; fast-loop latch: ++k
; <label>:183: ; preds = %165 | |
%184 = load i32, i32* %24, align 4 | |
%185 = add nsw i32 %184, 1 | |
store i32 %185, i32* %24, align 4 | |
br label %160, !llvm.loop !62 | |
; <label>:186: ; preds = %163 | |
br label %187 | |
; <label>:187: ; preds = %186 | |
store i32 0, i32* %14, align 4 | |
br label %188 | |
; join of tail/fast paths; dispatch on cleanup slot %14
; <label>:188: ; preds = %187, %157 | |
%189 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %189) #9 | |
%190 = load i32, i32* %14, align 4 | |
switch i32 %190, label %195 [ | |
i32 0, label %191 | |
] | |
; <label>:191: ; preds = %188 | |
br label %192 | |
; group-loop latch: j += 16
; <label>:192: ; preds = %191 | |
%193 = load i32, i32* %20, align 4 | |
%194 = add nsw i32 %193, 16 | |
store i32 %194, i32* %20, align 4 | |
br label %105, !llvm.loop !63 | |
; <label>:195: ; preds = %188, %108 | |
%196 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %196) #9 | |
br label %197 | |
; <label>:197: ; preds = %195 | |
br label %198 | |
; warp-level tree reduction: for offset = 16,8,4,2,1 fold the accumulator
; with __shfl_down(acc, offset, 32).  This is the legacy mask-less shuffle
; (pre-Volta semantics) — consistent with this module's sm_35 target.
; <label>:198: ; preds = %197, %80 | |
%199 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %199) #9 | |
store i32 16, i32* %26, align 4 | |
br label %200 | |
; <label>:200: ; preds = %212, %198 | |
%201 = load i32, i32* %26, align 4 | |
%202 = icmp sgt i32 %201, 0 | |
br i1 %202, label %205, label %203 | |
; <label>:203: ; preds = %200 | |
store i32 14, i32* %14, align 4 | |
%204 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %204) #9 | |
br label %215 | |
; <label>:205: ; preds = %200 | |
%206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%207 = load float, float* %19, align 4 | |
%208 = load i32, i32* %26, align 4 | |
%209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10 | |
%210 = load float, float* %19, align 4 | |
%211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10 | |
store float %211, float* %19, align 4 | |
br label %212 | |
; shuffle-loop latch: offset /= 2
; <label>:212: ; preds = %205 | |
%213 = load i32, i32* %26, align 4 | |
%214 = sdiv i32 %213, 2 | |
store i32 %214, i32* %26, align 4 | |
br label %200, !llvm.loop !64 | |
; write-back: %27 = lane id (threadIdx.x & 31); lane 0 of each warp with a
; valid row atomically folds its warp's result into output.coeffRef(row).
; <label>:215: ; preds = %203 | |
%216 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %216) #9 | |
%217 = load i32, i32* %12, align 4 | |
%218 = and i32 %217, 31 | |
store i32 %218, i32* %27, align 4 | |
%219 = load i32, i32* %27, align 4 | |
%220 = icmp eq i32 %219, 0 | |
br i1 %220, label %221, label %230 | |
; <label>:221: ; preds = %215 | |
%222 = load i32, i32* %18, align 4 | |
%223 = load i32, i32* %6, align 4 | |
%224 = icmp slt i32 %222, %223 | |
br i1 %224, label %225, label %230 | |
; <label>:225: ; preds = %221 | |
%226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%227 = load i32, i32* %18, align 4 | |
%228 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %227) #10 | |
%229 = load float, float* %19, align 4 | |
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10 | |
br label %230 | |
; end lifetimes of per-iteration locals before the outer-loop latch
; <label>:230: ; preds = %225, %221, %215 | |
%231 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %231) #9 | |
%232 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %232) #9 | |
%233 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %233) #9 | |
%234 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %234) #9 | |
%235 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %235) #9 | |
%236 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %236) #9 | |
br label %237 | |
; outer-loop latch: i += 32 (gridDim.x, asserted above)
; <label>:237: ; preds = %230 | |
%238 = load i32, i32* %13, align 4 | |
%239 = add nsw i32 %238, 32 | |
store i32 %239, i32* %13, align 4 | |
br label %74 | |
; function epilogue: end remaining lifetimes and return
; <label>:240: ; preds = %78 | |
%241 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %241) #9 | |
%242 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %242) #9 | |
%243 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %243) #9 | |
%244 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %244) #9 | |
%245 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %245) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca i32, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca float, align 4 | |
%20 = alloca i32, align 4 | |
%21 = alloca i32, align 4 | |
%22 = alloca i32, align 4 | |
%23 = alloca i32, align 4 | |
%24 = alloca i32, align 4 | |
%25 = alloca i32, align 4 | |
%26 = alloca i32, align 4 | |
%27 = alloca i32, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = icmp eq i32 %28, 256 | |
br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %5 | |
br label %32 | |
; <label>:31: ; preds = %5 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %32 | |
; <label>:32: ; preds = %31, %30 | |
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
%34 = icmp eq i32 %33, 1 | |
br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
br label %37 | |
; <label>:36: ; preds = %32 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %37 | |
; <label>:37: ; preds = %36, %35 | |
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
%39 = icmp eq i32 %38, 1 | |
br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
br label %42 | |
; <label>:41: ; preds = %37 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %42 | |
; <label>:42: ; preds = %41, %40 | |
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%44 = icmp eq i32 %43, 32 | |
br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
br label %47 | |
; <label>:46: ; preds = %42 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %47 | |
; <label>:47: ; preds = %46, %45 | |
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
%49 = icmp eq i32 %48, 1 | |
br i1 %49, label %50, label %51 | |
; <label>:50: ; preds = %47 | |
br label %52 | |
; <label>:51: ; preds = %47 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %52 | |
; <label>:52: ; preds = %51, %50 | |
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
%54 = icmp eq i32 %53, 1 | |
br i1 %54, label %55, label %56 | |
; <label>:55: ; preds = %52 | |
br label %57 | |
; <label>:56: ; preds = %52 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %57 | |
; <label>:57: ; preds = %56, %55 | |
%58 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %58) #9 | |
store i32 16, i32* %8, align 4 | |
%59 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
%60 = load i32, i32* %7, align 4 | |
%61 = add nsw i32 %60, 32768 | |
%62 = sub nsw i32 %61, 1 | |
%63 = sdiv i32 %62, 32768 | |
store i32 %63, i32* %9, align 4 | |
%64 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %64) #9 | |
%65 = load i32, i32* %9, align 4 | |
%66 = load i32, i32* %6, align 4 | |
%67 = mul nsw i32 %65, %66 | |
store i32 %67, i32* %10, align 4 | |
%68 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %68) #9 | |
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %69, i32* %11, align 4 | |
%70 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %70) #9 | |
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %71, i32* %12, align 4 | |
%72 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %72) #9 | |
%73 = load i32, i32* %11, align 4 | |
store i32 %73, i32* %13, align 4 | |
br label %74 | |
; <label>:74: ; preds = %232, %57 | |
%75 = load i32, i32* %13, align 4 | |
%76 = load i32, i32* %10, align 4 | |
%77 = icmp slt i32 %75, %76 | |
br i1 %77, label %80, label %78 | |
; <label>:78: ; preds = %74 | |
store i32 2, i32* %14, align 4 | |
%79 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %79) #9 | |
br label %235 | |
; <label>:80: ; preds = %74 | |
%81 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %81) #9 | |
%82 = load i32, i32* %13, align 4 | |
%83 = load i32, i32* %9, align 4 | |
%84 = srem i32 %82, %83 | |
store i32 %84, i32* %15, align 4 | |
%85 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
%86 = load i32, i32* %13, align 4 | |
%87 = load i32, i32* %9, align 4 | |
%88 = sdiv i32 %86, %87 | |
store i32 %88, i32* %16, align 4 | |
%89 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %89) #9 | |
%90 = load i32, i32* %15, align 4 | |
%91 = mul nsw i32 %90, 256 | |
%92 = mul nsw i32 %91, 128 | |
%93 = load i32, i32* %12, align 4 | |
%94 = add nsw i32 %92, %93 | |
store i32 %94, i32* %17, align 4 | |
%95 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
%96 = load i32, i32* %16, align 4 | |
store i32 %96, i32* %18, align 4 | |
%97 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %97) #9 | |
%98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
store float %98, float* %19, align 4 | |
%99 = load i32, i32* %18, align 4 | |
%100 = load i32, i32* %6, align 4 | |
%101 = icmp slt i32 %99, %100 | |
br i1 %101, label %102, label %195 | |
; <label>:102: ; preds = %80 | |
%103 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %103) #9 | |
store i32 0, i32* %20, align 4 | |
br label %104 | |
; <label>:104: ; preds = %189, %102 | |
%105 = load i32, i32* %20, align 4 | |
%106 = icmp slt i32 %105, 128 | |
br i1 %106, label %108, label %107 | |
; <label>:107: ; preds = %104 | |
store i32 5, i32* %14, align 4 | |
br label %192 | |
; <label>:108: ; preds = %104 | |
%109 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %109) #9 | |
%110 = load i32, i32* %17, align 4 | |
%111 = load i32, i32* %20, align 4 | |
%112 = add nsw i32 %111, 16 | |
%113 = sub nsw i32 %112, 1 | |
%114 = mul nsw i32 256, %113 | |
%115 = add nsw i32 %110, %114 | |
store i32 %115, i32* %21, align 4 | |
%116 = load i32, i32* %21, align 4 | |
%117 = load i32, i32* %7, align 4 | |
%118 = icmp sge i32 %116, %117 | |
br i1 %118, label %119, label %156 | |
; <label>:119: ; preds = %108 | |
%120 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %120) #9 | |
store i32 0, i32* %22, align 4 | |
br label %121 | |
; <label>:121: ; preds = %150, %119 | |
%122 = load i32, i32* %22, align 4 | |
%123 = icmp slt i32 %122, 15 | |
br i1 %123, label %125, label %124 | |
; <label>:124: ; preds = %121 | |
store i32 8, i32* %14, align 4 | |
br label %153 | |
; <label>:125: ; preds = %121 | |
%126 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %126) #9 | |
%127 = load i32, i32* %17, align 4 | |
%128 = load i32, i32* %20, align 4 | |
%129 = load i32, i32* %22, align 4 | |
%130 = add nsw i32 %128, %129 | |
%131 = mul nsw i32 256, %130 | |
%132 = add nsw i32 %127, %131 | |
store i32 %132, i32* %23, align 4 | |
%133 = load i32, i32* %23, align 4 | |
%134 = load i32, i32* %7, align 4 | |
%135 = icmp sge i32 %133, %134 | |
br i1 %135, label %136, label %137 | |
; <label>:136: ; preds = %125 | |
store i32 8, i32* %14, align 4 | |
br label %146 | |
; <label>:137: ; preds = %125 | |
%138 = load float, float* %19, align 4 | |
%139 = load i32, i32* %18, align 4 | |
%140 = load i32, i32* %7, align 4 | |
%141 = mul nsw i32 %139, %140 | |
%142 = load i32, i32* %23, align 4 | |
%143 = add nsw i32 %141, %142 | |
%144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %143) #10 | |
%145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10 | |
store float %145, float* %19, align 4 | |
store i32 0, i32* %14, align 4 | |
br label %146 | |
; <label>:146: ; preds = %137, %136 | |
%147 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %147) #9 | |
%148 = load i32, i32* %14, align 4 | |
switch i32 %148, label %153 [ | |
i32 0, label %149 | |
] | |
; <label>:149: ; preds = %146 | |
br label %150 | |
; <label>:150: ; preds = %149 | |
%151 = load i32, i32* %22, align 4 | |
%152 = add nsw i32 %151, 1 | |
store i32 %152, i32* %22, align 4 | |
br label %121, !llvm.loop !65 | |
; <label>:153: ; preds = %146, %124 | |
%154 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %154) #9 | |
br label %155 | |
; <label>:155: ; preds = %153 | |
store i32 5, i32* %14, align 4 | |
br label %185 | |
; <label>:156: ; preds = %108 | |
%157 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %157) #9 | |
store i32 0, i32* %24, align 4 | |
br label %158 | |
; <label>:158: ; preds = %180, %156 | |
%159 = load i32, i32* %24, align 4 | |
%160 = icmp slt i32 %159, 16 | |
br i1 %160, label %163, label %161 | |
; <label>:161: ; preds = %158 | |
store i32 11, i32* %14, align 4 | |
%162 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %162) #9 | |
br label %183 | |
; <label>:163: ; preds = %158 | |
%164 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %164) #9 | |
%165 = load i32, i32* %17, align 4 | |
%166 = load i32, i32* %20, align 4 | |
%167 = load i32, i32* %24, align 4 | |
%168 = add nsw i32 %166, %167 | |
%169 = mul nsw i32 256, %168 | |
%170 = add nsw i32 %165, %169 | |
store i32 %170, i32* %25, align 4 | |
%171 = load float, float* %19, align 4 | |
%172 = load i32, i32* %18, align 4 | |
%173 = load i32, i32* %7, align 4 | |
%174 = mul nsw i32 %172, %173 | |
%175 = load i32, i32* %25, align 4 | |
%176 = add nsw i32 %174, %175 | |
%177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %176) #10 | |
%178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10 | |
store float %178, float* %19, align 4 | |
%179 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %179) #9 | |
br label %180 | |
; <label>:180: ; preds = %163 | |
%181 = load i32, i32* %24, align 4 | |
%182 = add nsw i32 %181, 1 | |
store i32 %182, i32* %24, align 4 | |
br label %158, !llvm.loop !66 | |
; <label>:183: ; preds = %161 | |
br label %184 | |
; <label>:184: ; preds = %183 | |
store i32 0, i32* %14, align 4 | |
br label %185 | |
; <label>:185: ; preds = %184, %155 | |
%186 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %186) #9 | |
%187 = load i32, i32* %14, align 4 | |
switch i32 %187, label %192 [ | |
i32 0, label %188 | |
] | |
; <label>:188: ; preds = %185 | |
br label %189 | |
; <label>:189: ; preds = %188 | |
%190 = load i32, i32* %20, align 4 | |
%191 = add nsw i32 %190, 16 | |
store i32 %191, i32* %20, align 4 | |
br label %104, !llvm.loop !67 | |
; <label>:192: ; preds = %185, %107 | |
%193 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %193) #9 | |
br label %194 | |
; <label>:194: ; preds = %192 | |
br label %195 | |
; <label>:195: ; preds = %194, %80 | |
%196 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %196) #9 | |
store i32 16, i32* %26, align 4 | |
br label %197 | |
; <label>:197: ; preds = %208, %195 | |
%198 = load i32, i32* %26, align 4 | |
%199 = icmp sgt i32 %198, 0 | |
br i1 %199, label %202, label %200 | |
; <label>:200: ; preds = %197 | |
store i32 14, i32* %14, align 4 | |
%201 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %201) #9 | |
br label %211 | |
; <label>:202: ; preds = %197 | |
%203 = load float, float* %19, align 4 | |
%204 = load i32, i32* %26, align 4 | |
%205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10 | |
%206 = load float, float* %19, align 4 | |
%207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10 | |
store float %207, float* %19, align 4 | |
br label %208 | |
; <label>:208: ; preds = %202 | |
%209 = load i32, i32* %26, align 4 | |
%210 = sdiv i32 %209, 2 | |
store i32 %210, i32* %26, align 4 | |
br label %197, !llvm.loop !68 | |
; <label>:211: ; preds = %200 | |
%212 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %212) #9 | |
%213 = load i32, i32* %12, align 4 | |
%214 = and i32 %213, 31 | |
store i32 %214, i32* %27, align 4 | |
%215 = load i32, i32* %27, align 4 | |
%216 = icmp eq i32 %215, 0 | |
br i1 %216, label %217, label %225 | |
; <label>:217: ; preds = %211 | |
%218 = load i32, i32* %18, align 4 | |
%219 = load i32, i32* %6, align 4 | |
%220 = icmp slt i32 %218, %219 | |
br i1 %220, label %221, label %225 | |
; <label>:221: ; preds = %217 | |
%222 = load i32, i32* %18, align 4 | |
%223 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %222) #10 | |
%224 = load float, float* %19, align 4 | |
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10 | |
br label %225 | |
; <label>:225: ; preds = %221, %217, %211 | |
%226 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %226) #9 | |
%227 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %227) #9 | |
%228 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %228) #9 | |
%229 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %229) #9 | |
%230 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %230) #9 | |
%231 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %231) #9 | |
br label %232 | |
; <label>:232: ; preds = %225 | |
%233 = load i32, i32* %13, align 4 | |
%234 = add nsw i32 %233, 32 | |
store i32 %234, i32* %13, align 4 | |
br label %74 | |
; <label>:235: ; preds = %78 | |
%236 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %236) #9 | |
%237 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %237) #9 | |
%238 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %238) #9 | |
%239 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %239) #9 | |
%240 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %240) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
; ColumnReduceKernel<128, 256, 16> instantiated for CudaSumReducer.
; Reduces a 2D float tensor column-wise: each thread folds a strip of up to
; 16 consecutive rows of one column into a partial sum, then atomically
; merges that partial into the 1D output at that column's slot.
; Args: %0 = reducer functor (byval), %1 = 2D input evaluator (byval),
;       %2, %3 = two extent ints — from the index arithmetic below, %2 is the
;       reduced extent and %3 the preserved extent (presumably num_rows and
;       num_cols in Eigen's ColumnReduceKernel — confirm against
;       TensorReductionCuda.h), %4 = 1D output evaluator (byval).
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat { | |
  ; Stack slots: %6/%7 hold the two int args; %12 is the cleanup-dispatch
  ; switch value; %13 = column, %14 = strip base row, %15 = accumulator,
  ; %16 = inner-loop counter, %17 = per-element temporary.
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  %9 = alloca i32, align 4 | |
  %10 = alloca i32, align 4 | |
  %11 = alloca i32, align 4 | |
  %12 = alloca i32 | |
  %13 = alloca i32, align 4 | |
  %14 = alloca i32, align 4 | |
  %15 = alloca float, align 4 | |
  %16 = alloca i32, align 4 | |
  %17 = alloca float, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
  ; Launch-configuration asserts: the kernel requires exactly a 256x1x1
  ; block and a 128x1x1 grid, matching the <128, 256, 16> template args.
  %18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %19 = icmp eq i32 %18, 256 | |
  br i1 %19, label %20, label %21 | |
; <label>:20:                                     ; preds = %5 | |
  br label %22 | |
; <label>:21:                                     ; preds = %5 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %22 | |
; <label>:22:                                     ; preds = %21, %20 | |
  %23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
  %24 = icmp eq i32 %23, 1 | |
  br i1 %24, label %25, label %26 | |
; <label>:25:                                     ; preds = %22 | |
  br label %27 | |
; <label>:26:                                     ; preds = %22 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %27 | |
; <label>:27:                                     ; preds = %26, %25 | |
  %28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
  %29 = icmp eq i32 %28, 1 | |
  br i1 %29, label %30, label %31 | |
; <label>:30:                                     ; preds = %27 | |
  br label %32 | |
; <label>:31:                                     ; preds = %27 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %32 | |
; <label>:32:                                     ; preds = %31, %30 | |
  %33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %34 = icmp eq i32 %33, 128 | |
  br i1 %34, label %35, label %36 | |
; <label>:35:                                     ; preds = %32 | |
  br label %37 | |
; <label>:36:                                     ; preds = %32 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %37 | |
; <label>:37:                                     ; preds = %36, %35 | |
  %38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
  %39 = icmp eq i32 %38, 1 | |
  br i1 %39, label %40, label %41 | |
; <label>:40:                                     ; preds = %37 | |
  br label %42 | |
; <label>:41:                                     ; preds = %37 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %42 | |
; <label>:42:                                     ; preds = %41, %40 | |
  %43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
  %44 = icmp eq i32 %43, 1 | |
  br i1 %44, label %45, label %46 | |
; <label>:45:                                     ; preds = %42 | |
  br label %47 | |
; <label>:46:                                     ; preds = %42 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %47 | |
; <label>:47:                                     ; preds = %46, %45 | |
  ; %8 = ceil(%2 / 16) * %3: total number of (16-row strip, column) work
  ; items to distribute over the grid.
  %48 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %48) #9 | |
  %49 = load i32, i32* %6, align 4 | |
  %50 = add nsw i32 %49, 16 | |
  %51 = sub nsw i32 %50, 1 | |
  %52 = sdiv i32 %51, 16 | |
  %53 = load i32, i32* %7, align 4 | |
  %54 = mul nsw i32 %52, %53 | |
  store i32 %54, i32* %8, align 4 | |
  %55 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
  %56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %56, i32* %9, align 4 | |
  %57 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %57) #9 | |
  %58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %58, i32* %10, align 4 | |
  ; %11 = blockIdx.x * 256 + threadIdx.x: flat global thread / work-item id.
  %59 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
  %60 = load i32, i32* %9, align 4 | |
  %61 = mul nsw i32 %60, 256 | |
  %62 = load i32, i32* %10, align 4 | |
  %63 = add nsw i32 %61, %62 | |
  store i32 %63, i32* %11, align 4 | |
  br label %64 | |
; <label>:64:                                     ; preds = %135, %47 | |
  ; Outer loop over work items, striding by 32768 = 128 blocks * 256
  ; threads per iteration (see the +32768 at label %135).
  %65 = load i32, i32* %11, align 4 | |
  %66 = load i32, i32* %8, align 4 | |
  %67 = icmp slt i32 %65, %66 | |
  br i1 %67, label %70, label %68 | |
; <label>:68:                                     ; preds = %64 | |
  store i32 2, i32* %12, align 4 | |
  %69 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %69) #9 | |
  br label %138 | |
; <label>:70:                                     ; preds = %64 | |
  ; Decode the work item: %13 = item % %7 (column index),
  ; %14 = ((item / %7) % ceil(%6/16)) * 16 (first row of the 16-row strip).
  %71 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
  %72 = load i32, i32* %11, align 4 | |
  %73 = load i32, i32* %7, align 4 | |
  %74 = srem i32 %72, %73 | |
  store i32 %74, i32* %13, align 4 | |
  %75 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %75) #9 | |
  %76 = load i32, i32* %11, align 4 | |
  %77 = load i32, i32* %7, align 4 | |
  %78 = sdiv i32 %76, %77 | |
  %79 = load i32, i32* %6, align 4 | |
  %80 = add nsw i32 %79, 16 | |
  %81 = sub nsw i32 %80, 1 | |
  %82 = sdiv i32 %81, 16 | |
  %83 = srem i32 %78, %82 | |
  %84 = mul nsw i32 %83, 16 | |
  store i32 %84, i32* %14, align 4 | |
  ; Accumulator %15 starts at the reducer's bottom value (the identity for
  ; the sum reduction); CudaSumReducer is handled via its FnSumReducer base.
  %85 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
  %86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10 | |
  store float %87, float* %15, align 4 | |
  %88 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %88) #9 | |
  store i32 0, i32* %16, align 4 | |
  br label %89 | |
; <label>:89:                                     ; preds = %124, %70 | |
  ; Inner loop: fold up to 16 rows (strip base %14 + j, j in [0,16)) of
  ; column %13 into %15.
  %90 = load i32, i32* %16, align 4 | |
  %91 = icmp slt i32 %90, 16 | |
  br i1 %91, label %94, label %92 | |
; <label>:92:                                     ; preds = %89 | |
  store i32 5, i32* %12, align 4 | |
  %93 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %93) #9 | |
  br label %127 | |
; <label>:94:                                     ; preds = %89 | |
  ; In-bounds (col < %7 and row < %6): read input at row * %7 + col;
  ; out-of-bounds lanes contribute the bottom value instead.
  %95 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
  %96 = load i32, i32* %13, align 4 | |
  %97 = load i32, i32* %7, align 4 | |
  %98 = icmp slt i32 %96, %97 | |
  br i1 %98, label %99, label %114 | |
; <label>:99:                                     ; preds = %94 | |
  %100 = load i32, i32* %14, align 4 | |
  %101 = load i32, i32* %16, align 4 | |
  %102 = add nsw i32 %100, %101 | |
  %103 = load i32, i32* %6, align 4 | |
  %104 = icmp slt i32 %102, %103 | |
  br i1 %104, label %105, label %114 | |
; <label>:105:                                    ; preds = %99 | |
  %106 = load i32, i32* %14, align 4 | |
  %107 = load i32, i32* %16, align 4 | |
  %108 = add nsw i32 %106, %107 | |
  %109 = load i32, i32* %7, align 4 | |
  %110 = mul nsw i32 %108, %109 | |
  %111 = load i32, i32* %13, align 4 | |
  %112 = add nsw i32 %110, %111 | |
  %113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %112) #10 | |
  br label %117 | |
; <label>:114:                                    ; preds = %99, %94 | |
  %115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10 | |
  br label %117 | |
; <label>:117:                                    ; preds = %114, %105 | |
  ; %15 = reduce(%15, element) via the reducer's operator().
  %118 = phi float [ %113, %105 ], [ %116, %114 ] | |
  store float %118, float* %17, align 4 | |
  %119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %120 = load float, float* %15, align 4 | |
  %121 = load float, float* %17, align 4 | |
  %122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10 | |
  store float %122, float* %15, align 4 | |
  %123 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %123) #9 | |
  br label %124 | |
; <label>:124:                                    ; preds = %117 | |
  %125 = load i32, i32* %16, align 4 | |
  %126 = add nsw i32 %125, 1 | |
  store i32 %126, i32* %16, align 4 | |
  br label %89 | |
; <label>:127:                                    ; preds = %92 | |
  ; Strip done: atomically merge the partial result into output[col].
  ; atomic_reduce is used because several thread strips target the same
  ; column of the 1D output.
  %128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %129 = load i32, i32* %13, align 4 | |
  %130 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %129) #10 | |
  %131 = load float, float* %15, align 4 | |
  call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10 | |
  %132 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %132) #9 | |
  %133 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %133) #9 | |
  %134 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %134) #9 | |
  br label %135 | |
; <label>:135:                                    ; preds = %127 | |
  ; Advance to the next work item for this thread (+ grid size).
  %136 = load i32, i32* %11, align 4 | |
  %137 = add nsw i32 %136, 32768 | |
  store i32 %137, i32* %11, align 4 | |
  br label %64 | |
; <label>:138:                                    ; preds = %68 | |
  %139 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %139) #9 | |
  %140 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %140) #9 | |
  %141 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %141) #9 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
; ColumnReduceKernel<128, 256, 16> instantiated for CudaMaxReducer.
; Structurally identical to the CudaSumReducer instantiation above, but the
; fold uses CudaMaxReducer's bottom_value / operator() / atomic_reduce
; directly (no FnSumReducer base-cast): each thread folds a strip of up to
; 16 consecutive rows of one column, then atomically merges the partial
; result into the 1D output at that column.
; Args: %0 = reducer functor (byval), %1 = 2D input evaluator (byval),
;       %2, %3 = the two extent ints (%2 = reduced extent, %3 = preserved
;       extent, per the index arithmetic below — presumably num_rows /
;       num_cols; confirm against Eigen's TensorReductionCuda.h),
;       %4 = 1D output evaluator (byval).
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat { | |
  ; Stack slots: %6/%7 hold the two int args; %12 is the cleanup-dispatch
  ; switch value; %13 = column, %14 = strip base row, %15 = accumulator,
  ; %16 = inner-loop counter, %17 = per-element temporary.
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  %9 = alloca i32, align 4 | |
  %10 = alloca i32, align 4 | |
  %11 = alloca i32, align 4 | |
  %12 = alloca i32 | |
  %13 = alloca i32, align 4 | |
  %14 = alloca i32, align 4 | |
  %15 = alloca float, align 4 | |
  %16 = alloca i32, align 4 | |
  %17 = alloca float, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
  ; Launch-configuration asserts: exactly 256x1x1 threads, 128x1x1 blocks,
  ; matching the <128, 256, 16> template arguments.
  %18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %19 = icmp eq i32 %18, 256 | |
  br i1 %19, label %20, label %21 | |
; <label>:20:                                     ; preds = %5 | |
  br label %22 | |
; <label>:21:                                     ; preds = %5 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %22 | |
; <label>:22:                                     ; preds = %21, %20 | |
  %23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
  %24 = icmp eq i32 %23, 1 | |
  br i1 %24, label %25, label %26 | |
; <label>:25:                                     ; preds = %22 | |
  br label %27 | |
; <label>:26:                                     ; preds = %22 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %27 | |
; <label>:27:                                     ; preds = %26, %25 | |
  %28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
  %29 = icmp eq i32 %28, 1 | |
  br i1 %29, label %30, label %31 | |
; <label>:30:                                     ; preds = %27 | |
  br label %32 | |
; <label>:31:                                     ; preds = %27 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %32 | |
; <label>:32:                                     ; preds = %31, %30 | |
  %33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %34 = icmp eq i32 %33, 128 | |
  br i1 %34, label %35, label %36 | |
; <label>:35:                                     ; preds = %32 | |
  br label %37 | |
; <label>:36:                                     ; preds = %32 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %37 | |
; <label>:37:                                     ; preds = %36, %35 | |
  %38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
  %39 = icmp eq i32 %38, 1 | |
  br i1 %39, label %40, label %41 | |
; <label>:40:                                     ; preds = %37 | |
  br label %42 | |
; <label>:41:                                     ; preds = %37 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %42 | |
; <label>:42:                                     ; preds = %41, %40 | |
  %43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
  %44 = icmp eq i32 %43, 1 | |
  br i1 %44, label %45, label %46 | |
; <label>:45:                                     ; preds = %42 | |
  br label %47 | |
; <label>:46:                                     ; preds = %42 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %47 | |
; <label>:47:                                     ; preds = %46, %45 | |
  ; %8 = ceil(%2 / 16) * %3: total number of (16-row strip, column) work
  ; items to distribute over the grid.
  %48 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %48) #9 | |
  %49 = load i32, i32* %6, align 4 | |
  %50 = add nsw i32 %49, 16 | |
  %51 = sub nsw i32 %50, 1 | |
  %52 = sdiv i32 %51, 16 | |
  %53 = load i32, i32* %7, align 4 | |
  %54 = mul nsw i32 %52, %53 | |
  store i32 %54, i32* %8, align 4 | |
  %55 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
  %56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %56, i32* %9, align 4 | |
  %57 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %57) #9 | |
  %58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %58, i32* %10, align 4 | |
  ; %11 = blockIdx.x * 256 + threadIdx.x: flat global thread / work-item id.
  %59 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
  %60 = load i32, i32* %9, align 4 | |
  %61 = mul nsw i32 %60, 256 | |
  %62 = load i32, i32* %10, align 4 | |
  %63 = add nsw i32 %61, %62 | |
  store i32 %63, i32* %11, align 4 | |
  br label %64 | |
; <label>:64:                                     ; preds = %131, %47 | |
  ; Outer loop over work items, striding by 32768 = 128 blocks * 256
  ; threads per iteration (see the +32768 at label %131).
  %65 = load i32, i32* %11, align 4 | |
  %66 = load i32, i32* %8, align 4 | |
  %67 = icmp slt i32 %65, %66 | |
  br i1 %67, label %70, label %68 | |
; <label>:68:                                     ; preds = %64 | |
  store i32 2, i32* %12, align 4 | |
  %69 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %69) #9 | |
  br label %134 | |
; <label>:70:                                     ; preds = %64 | |
  ; Decode the work item: %13 = item % %7 (column index),
  ; %14 = ((item / %7) % ceil(%6/16)) * 16 (first row of the 16-row strip).
  %71 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
  %72 = load i32, i32* %11, align 4 | |
  %73 = load i32, i32* %7, align 4 | |
  %74 = srem i32 %72, %73 | |
  store i32 %74, i32* %13, align 4 | |
  %75 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %75) #9 | |
  %76 = load i32, i32* %11, align 4 | |
  %77 = load i32, i32* %7, align 4 | |
  %78 = sdiv i32 %76, %77 | |
  %79 = load i32, i32* %6, align 4 | |
  %80 = add nsw i32 %79, 16 | |
  %81 = sub nsw i32 %80, 1 | |
  %82 = sdiv i32 %81, 16 | |
  %83 = srem i32 %78, %82 | |
  %84 = mul nsw i32 %83, 16 | |
  store i32 %84, i32* %14, align 4 | |
  ; Accumulator %15 starts at CudaMaxReducer's bottom value (the identity
  ; for the max reduction).
  %85 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
  %86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
  store float %86, float* %15, align 4 | |
  %87 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %87) #9 | |
  store i32 0, i32* %16, align 4 | |
  br label %88 | |
; <label>:88:                                     ; preds = %121, %70 | |
  ; Inner loop: fold up to 16 rows (strip base %14 + j, j in [0,16)) of
  ; column %13 into %15.
  %89 = load i32, i32* %16, align 4 | |
  %90 = icmp slt i32 %89, 16 | |
  br i1 %90, label %93, label %91 | |
; <label>:91:                                     ; preds = %88 | |
  store i32 5, i32* %12, align 4 | |
  %92 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %92) #9 | |
  br label %124 | |
; <label>:93:                                     ; preds = %88 | |
  ; In-bounds (col < %7 and row < %6): read input at row * %7 + col;
  ; out-of-bounds lanes contribute the bottom value instead.
  %94 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %94) #9 | |
  %95 = load i32, i32* %13, align 4 | |
  %96 = load i32, i32* %7, align 4 | |
  %97 = icmp slt i32 %95, %96 | |
  br i1 %97, label %98, label %113 | |
; <label>:98:                                     ; preds = %93 | |
  %99 = load i32, i32* %14, align 4 | |
  %100 = load i32, i32* %16, align 4 | |
  %101 = add nsw i32 %99, %100 | |
  %102 = load i32, i32* %6, align 4 | |
  %103 = icmp slt i32 %101, %102 | |
  br i1 %103, label %104, label %113 | |
; <label>:104:                                    ; preds = %98 | |
  %105 = load i32, i32* %14, align 4 | |
  %106 = load i32, i32* %16, align 4 | |
  %107 = add nsw i32 %105, %106 | |
  %108 = load i32, i32* %7, align 4 | |
  %109 = mul nsw i32 %107, %108 | |
  %110 = load i32, i32* %13, align 4 | |
  %111 = add nsw i32 %109, %110 | |
  %112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %111) #10 | |
  br label %115 | |
; <label>:113:                                    ; preds = %98, %93 | |
  %114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
  br label %115 | |
; <label>:115:                                    ; preds = %113, %104 | |
  ; %15 = reduce(%15, element) via CudaMaxReducer::operator().
  %116 = phi float [ %112, %104 ], [ %114, %113 ] | |
  store float %116, float* %17, align 4 | |
  %117 = load float, float* %15, align 4 | |
  %118 = load float, float* %17, align 4 | |
  %119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10 | |
  store float %119, float* %15, align 4 | |
  %120 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %120) #9 | |
  br label %121 | |
; <label>:121:                                    ; preds = %115 | |
  %122 = load i32, i32* %16, align 4 | |
  %123 = add nsw i32 %122, 1 | |
  store i32 %123, i32* %16, align 4 | |
  br label %88 | |
; <label>:124:                                    ; preds = %91 | |
  ; Strip done: atomically merge the partial result into output[col];
  ; atomic_reduce is needed since multiple strips target the same column.
  %125 = load i32, i32* %13, align 4 | |
  %126 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %125) #10 | |
  %127 = load float, float* %15, align 4 | |
  call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10 | |
  %128 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %128) #9 | |
  %129 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %129) #9 | |
  %130 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %130) #9 | |
  br label %131 | |
; <label>:131:                                    ; preds = %124 | |
  ; Advance to the next work item for this thread (+ grid size).
  %132 = load i32, i32* %11, align 4 | |
  %133 = add nsw i32 %132, 32768 | |
  store i32 %133, i32* %11, align 4 | |
  br label %64 | |
; <label>:134:                                    ; preds = %68 | |
  %135 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %135) #9 | |
  %136 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %136) #9 | |
  %137 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %137) #9 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, float*) #0 comdat { | |
%5 = alloca i32, align 4 | |
%6 = alloca float*, align 8 | |
%7 = alloca i32, align 4 | |
%8 = alloca float, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca float, align 4 | |
%15 = alloca i32, align 4 | |
store i32 %2, i32* %5, align 4 | |
store float* %3, float** %6, align 8 | |
%16 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %16) #9 | |
%17 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%18 = mul i32 %17, 256 | |
%19 = mul i32 %18, 128 | |
%20 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%21 = add i32 %19, %20 | |
store i32 %21, i32* %7, align 4 | |
%22 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %24, label %31 | |
; <label>:24: ; preds = %4 | |
%25 = load i32, i32* %7, align 4 | |
%26 = icmp eq i32 %25, 0 | |
br i1 %26, label %27, label %30 | |
; <label>:27: ; preds = %24 | |
%28 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
%29 = load float*, float** %6, align 8 | |
store float %28, float* %29, align 4 | |
br label %30 | |
; <label>:30: ; preds = %27, %24 | |
call void @llvm.cuda.syncthreads() | |
br label %31 | |
; <label>:31: ; preds = %30, %4 | |
%32 = bitcast float* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %32) #9 | |
%33 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
store float %33, float* %8, align 4 | |
%34 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %34) #9 | |
%35 = load i32, i32* %5, align 4 | |
%36 = load i32, i32* %7, align 4 | |
%37 = sub nsw i32 %35, %36 | |
store i32 %37, i32* %10, align 4 | |
store i32 32768, i32* %11, align 4 | |
%38 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %10, i32* dereferenceable(4) %11) #10 | |
store i32 %38, i32* %9, align 4 | |
%39 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %39) #9 | |
store i32 0, i32* %12, align 4 | |
br label %40 | |
; <label>:40: ; preds = %58, %31 | |
%41 = load i32, i32* %12, align 4 | |
%42 = load i32, i32* %9, align 4 | |
%43 = icmp slt i32 %41, %42 | |
br i1 %43, label %46, label %44 | |
; <label>:44: ; preds = %40 | |
%45 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %45) #9 | |
br label %61 | |
; <label>:46: ; preds = %40 | |
%47 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %47) #9 | |
%48 = load i32, i32* %7, align 4 | |
%49 = load i32, i32* %12, align 4 | |
%50 = add nsw i32 %48, %49 | |
store i32 %50, i32* %13, align 4 | |
%51 = bitcast float* %14 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %51) #9 | |
%52 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10 | |
%53 = load i32, i32* %13, align 4 | |
%54 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %52, i32 %53) #10 | |
store float %54, float* %14, align 4 | |
%55 = load float, float* %14, align 4 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %55, float* %8) #10 | |
%56 = bitcast float* %14 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %56) #9 | |
%57 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %57) #9 | |
br label %58 | |
; <label>:58: ; preds = %46 | |
%59 = load i32, i32* %12, align 4 | |
%60 = add nsw i32 %59, 256 | |
store i32 %60, i32* %12, align 4 | |
br label %40, !llvm.loop !69 | |
; <label>:61: ; preds = %44 | |
%62 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %62) #9 | |
store i32 16, i32* %15, align 4 | |
br label %63 | |
; <label>:63: ; preds = %72, %61 | |
%64 = load i32, i32* %15, align 4 | |
%65 = icmp sgt i32 %64, 0 | |
br i1 %65, label %68, label %66 | |
; <label>:66: ; preds = %63 | |
%67 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %67) #9 | |
br label %75 | |
; <label>:68: ; preds = %63 | |
%69 = load float, float* %8, align 4 | |
%70 = load i32, i32* %15, align 4 | |
%71 = call float @_ZL11__shfl_downfji(float %69, i32 %70, i32 32) #10 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %71, float* %8) #10 | |
br label %72 | |
; <label>:72: ; preds = %68 | |
%73 = load i32, i32* %15, align 4 | |
%74 = sdiv i32 %73, 2 | |
store i32 %74, i32* %15, align 4 | |
br label %63, !llvm.loop !70 | |
; <label>:75: ; preds = %66 | |
%76 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%77 = and i32 %76, 31 | |
%78 = icmp eq i32 %77, 0 | |
br i1 %78, label %79, label %82 | |
; <label>:79: ; preds = %75 | |
%80 = load float*, float** %6, align 8 | |
%81 = load float, float* %8, align 4 | |
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %80, float %81, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10 | |
br label %82 | |
; <label>:82: ; preds = %79, %75 | |
%83 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %83) #9 | |
%84 = bitcast float* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %84) #9 | |
%85 = bitcast i32* %7 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %85) #9 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind
; Coefficient accessor for the evaluated TensorMap<Tensor<float,2>> input:
; returns data[index] via Eigen's read-only loadConstant helper (which maps to
; the texture/read-only cache path on this target). -O0 style IR: both
; arguments are spilled to stack slots and re-loaded before use.
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"*, i32) #4 comdat align 2 {
  %this.addr = alloca %"struct.Eigen::TensorEvaluator.13"*, align 8
  %index.addr = alloca i32, align 4
  store %"struct.Eigen::TensorEvaluator.13"* %0, %"struct.Eigen::TensorEvaluator.13"** %this.addr, align 8
  store i32 %1, i32* %index.addr, align 4
  %this1 = load %"struct.Eigen::TensorEvaluator.13"*, %"struct.Eigen::TensorEvaluator.13"** %this.addr, align 8
  ; field 0 of TensorEvaluator.13 is the raw float* data pointer (see type def)
  %data.slot = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %this1, i32 0, i32 0
  %data = load float*, float** %data.slot, align 8
  %index = load i32, i32* %index.addr, align 4
  %index.ext = sext i32 %index to i64
  %elem.addr = getelementptr inbounds float, float* %data, i64 %index.ext
  %value = call float @_ZN5Eigen12_GLOBAL__N_112loadConstantIfEET_PKS2_(float* %elem.addr) #10
  ret float %value
}
; Function Attrs: convergent nounwind | |
; ---------------------------------------------------------------------------
; Eigen InnerReductionKernel<128, ..., SumReducer<float>> lowered to NVPTX IR
; at -O0 (every value round-trips through an alloca stack slot).
; Sums along the inner (contiguous) dimension of a 2-D float tensor expression.
; Parameters:
;   %0 - SumReducer<float>, byval (stateless, 1 byte)
;   %1 - TensorEvaluator of the reduction input, byval
;   %2 - coefficients reduced per output value (spilled to %6)
;   %3 - number of preserved output coefficients (spilled to %7)
;   %4 - output buffer (spilled to %8)
; NOTE(review): parameter roles inferred from the mangled name and the index
; arithmetic below - confirm against Eigen's TensorReductionCuda.h.
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca float*, align 8 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32, align 4 | |
%15 = alloca i32, align 4 | |
; %16 is a cleanup-dispatch slot: stores of 5/8/11/14/17 below record which
; loop exit was taken so shared epilogue blocks can branch accordingly.
%16 = alloca i32 | |
%17 = alloca i32, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca i32, align 4 | |
%20 = alloca float, align 4 | |
%21 = alloca i32, align 4 | |
%22 = alloca i32, align 4 | |
%23 = alloca i32, align 4 | |
%24 = alloca float, align 4 | |
%25 = alloca i32, align 4 | |
%26 = alloca i32, align 4 | |
%27 = alloca i32, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
store float* %4, float** %8, align 8 | |
%28 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %28) #9 | |
; %9 holds the constant 16 but is never re-read below (dead local at -O0)
store i32 16, i32* %9, align 4 | |
%29 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %29) #9 | |
; %10 = divup(num_coeffs_to_reduce, blockDim.x * 128): input chunks per output
%30 = load i32, i32* %6, align 4 | |
%31 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%32 = mul i32 %31, 128 | |
%33 = call i32 @_ZN5Eigen5divupIiijEET_T0_T1_(i32 %30, i32 %32) #10 | |
store i32 %33, i32* %10, align 4 | |
%34 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %34) #9 | |
; %11 = total work items = chunks_per_output * num_preserved
%35 = load i32, i32* %10, align 4 | |
%36 = load i32, i32* %7, align 4 | |
%37 = mul nsw i32 %35, %36 | |
store i32 %37, i32* %11, align 4 | |
%38 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %38) #9 | |
; %12 = total threads in the grid (blockDim.x * gridDim.x)
%39 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%40 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%41 = mul i32 %39, %40 | |
store i32 %41, i32* %12, align 4 | |
%42 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %42) #9 | |
; %13 = this thread's global linear id
%43 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%44 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%45 = mul i32 %43, %44 | |
%46 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%47 = add i32 %45, %46 | |
store i32 %47, i32* %13, align 4 | |
; Single-block launches must initialize the output buffer themselves.
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%49 = icmp eq i32 %48, 1 | |
br i1 %49, label %50, label %70 | |
; <label>:50: ; preds = %5 | |
%51 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %51) #9 | |
%52 = load i32, i32* %13, align 4 | |
store i32 %52, i32* %14, align 4 | |
br label %53 | |
; grid-stride init loop: output[i] = reducer.initialize() for i in [tid, %7)
; <label>:53: ; preds = %65, %50 | |
%54 = load i32, i32* %14, align 4 | |
%55 = load i32, i32* %7, align 4 | |
%56 = icmp slt i32 %54, %55 | |
br i1 %56, label %59, label %57 | |
; <label>:57: ; preds = %53 | |
%58 = bitcast i32* %14 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %58) #9 | |
br label %69 | |
; <label>:59: ; preds = %53 | |
%60 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
%61 = load i32, i32* %14, align 4 | |
%62 = sext i32 %61 to i64 | |
%63 = load float*, float** %8, align 8 | |
%64 = getelementptr inbounds float, float* %63, i64 %62 | |
store float %60, float* %64, align 4 | |
br label %65 | |
; <label>:65: ; preds = %59 | |
%66 = load i32, i32* %12, align 4 | |
%67 = load i32, i32* %14, align 4 | |
%68 = add nsw i32 %67, %66 | |
store i32 %68, i32* %14, align 4 | |
br label %53 | |
; <label>:69: ; preds = %57 | |
br label %70 | |
; Outer loop over flattened (row, chunk) work items:
; x = blockIdx.x; x < %11; x += gridDim.x (one block per work item at a time)
; <label>:70: ; preds = %69, %5 | |
%71 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
%72 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %72, i32* %15, align 4 | |
br label %73 | |
; <label>:73: ; preds = %215, %70 | |
%74 = load i32, i32* %15, align 4 | |
%75 = load i32, i32* %11, align 4 | |
%76 = icmp slt i32 %74, %75 | |
br i1 %76, label %79, label %77 | |
; <label>:77: ; preds = %73 | |
store i32 5, i32* %16, align 4 | |
%78 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %78) #9 | |
br label %219 | |
; row = x / chunks_per_output; only rows < num_preserved produce work
; <label>:79: ; preds = %73 | |
%80 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %80) #9 | |
%81 = load i32, i32* %15, align 4 | |
%82 = load i32, i32* %10, align 4 | |
%83 = sdiv i32 %81, %82 | |
store i32 %83, i32* %17, align 4 | |
%84 = load i32, i32* %17, align 4 | |
%85 = load i32, i32* %7, align 4 | |
%86 = icmp slt i32 %84, %85 | |
br i1 %86, label %87, label %213 | |
; chunk = x % chunks_per_output; %19 = this thread's first input index within
; the chunk; %20 = this thread's running partial sum
; <label>:87: ; preds = %79 | |
%88 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %88) #9 | |
%89 = load i32, i32* %15, align 4 | |
%90 = load i32, i32* %10, align 4 | |
%91 = srem i32 %89, %90 | |
store i32 %91, i32* %18, align 4 | |
%92 = bitcast i32* %19 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %92) #9 | |
%93 = load i32, i32* %18, align 4 | |
%94 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%95 = mul i32 %93, %94 | |
%96 = mul i32 %95, 128 | |
%97 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%98 = add i32 %96, %97 | |
store i32 %98, i32* %19, align 4 | |
%99 = bitcast float* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %99) #9 | |
%100 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
store float %100, float* %20, align 4 | |
%101 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %101) #9 | |
; accumulate 128 coefficients per thread in 8 groups of 16 (j = 0,16,...,112)
store i32 0, i32* %21, align 4 | |
br label %102 | |
; <label>:102: ; preds = %180, %87 | |
%103 = load i32, i32* %21, align 4 | |
%104 = icmp slt i32 %103, 128 | |
br i1 %104, label %106, label %105 | |
; <label>:105: ; preds = %102 | |
store i32 8, i32* %16, align 4 | |
br label %183 | |
; %22 = last input index this 16-step group would touch; if it overruns the
; row, take the bounds-checked loop (%118), else the unchecked 16-iteration
; path (%147)
; <label>:106: ; preds = %102 | |
%107 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %107) #9 | |
%108 = load i32, i32* %19, align 4 | |
%109 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%110 = load i32, i32* %21, align 4 | |
%111 = add nsw i32 %110, 16 | |
%112 = sub nsw i32 %111, 1 | |
%113 = mul i32 %109, %112 | |
%114 = add i32 %108, %113 | |
store i32 %114, i32* %22, align 4 | |
%115 = load i32, i32* %22, align 4 | |
%116 = load i32, i32* %6, align 4 | |
%117 = icmp sge i32 %115, %116 | |
br i1 %117, label %118, label %147 | |
; guarded tail: step by blockDim.x while the index stays inside the row
; <label>:118: ; preds = %106 | |
%119 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %119) #9 | |
%120 = load i32, i32* %19, align 4 | |
%121 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%122 = load i32, i32* %21, align 4 | |
%123 = mul i32 %121, %122 | |
%124 = add i32 %120, %123 | |
store i32 %124, i32* %23, align 4 | |
br label %125 | |
; <label>:125: ; preds = %142, %118 | |
%126 = load i32, i32* %23, align 4 | |
%127 = load i32, i32* %6, align 4 | |
%128 = icmp slt i32 %126, %127 | |
br i1 %128, label %131, label %129 | |
; <label>:129: ; preds = %125 | |
store i32 11, i32* %16, align 4 | |
%130 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %130) #9 | |
br label %146 | |
; <label>:131: ; preds = %125 | |
%132 = bitcast float* %24 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %132) #9 | |
; evaluator field 10 is the nested input TensorEvaluator; linear coefficient
; index = row * num_coeffs_to_reduce + col
%133 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10 | |
%134 = load i32, i32* %17, align 4 | |
%135 = load i32, i32* %6, align 4 | |
%136 = mul nsw i32 %134, %135 | |
%137 = load i32, i32* %23, align 4 | |
%138 = add nsw i32 %136, %137 | |
%139 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %133, i32 %138) #10 | |
store float %139, float* %24, align 4 | |
%140 = load float, float* %24, align 4 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %140, float* %20) #10 | |
%141 = bitcast float* %24 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %141) #9 | |
br label %142 | |
; <label>:142: ; preds = %131 | |
%143 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%144 = load i32, i32* %23, align 4 | |
%145 = add i32 %144, %143 | |
store i32 %145, i32* %23, align 4 | |
br label %125 | |
; after the guarded tail the chunk loop is done (dispatch value 8 -> %183)
; <label>:146: ; preds = %129 | |
store i32 8, i32* %16, align 4 | |
br label %176 | |
; fast path: 16 strided loads, no per-element bounds checks needed
; <label>:147: ; preds = %106 | |
%148 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %148) #9 | |
store i32 0, i32* %25, align 4 | |
br label %149 | |
; <label>:149: ; preds = %171, %147 | |
%150 = load i32, i32* %25, align 4 | |
%151 = icmp slt i32 %150, 16 | |
br i1 %151, label %154, label %152 | |
; <label>:152: ; preds = %149 | |
store i32 14, i32* %16, align 4 | |
%153 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %153) #9 | |
br label %174 | |
; <label>:154: ; preds = %149 | |
%155 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %155) #9 | |
%156 = load i32, i32* %19, align 4 | |
%157 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%158 = load i32, i32* %21, align 4 | |
%159 = load i32, i32* %25, align 4 | |
%160 = add nsw i32 %158, %159 | |
%161 = mul i32 %157, %160 | |
%162 = add i32 %156, %161 | |
store i32 %162, i32* %26, align 4 | |
%163 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10 | |
%164 = load i32, i32* %17, align 4 | |
%165 = load i32, i32* %6, align 4 | |
%166 = mul nsw i32 %164, %165 | |
%167 = load i32, i32* %26, align 4 | |
%168 = add nsw i32 %166, %167 | |
%169 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %163, i32 %168) #10 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %169, float* %20) #10 | |
%170 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %170) #9 | |
br label %171 | |
; <label>:171: ; preds = %154 | |
%172 = load i32, i32* %25, align 4 | |
%173 = add nsw i32 %172, 1 | |
store i32 %173, i32* %25, align 4 | |
br label %149, !llvm.loop !71 | |
; <label>:174: ; preds = %152 | |
br label %175 | |
; <label>:175: ; preds = %174 | |
store i32 0, i32* %16, align 4 | |
br label %176 | |
; shared chunk epilogue: dispatch 0 continues the j-loop, anything else breaks
; <label>:176: ; preds = %175, %146 | |
%177 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %177) #9 | |
%178 = load i32, i32* %16, align 4 | |
switch i32 %178, label %183 [ | |
i32 0, label %179 | |
] | |
; <label>:179: ; preds = %176 | |
br label %180 | |
; <label>:180: ; preds = %179 | |
%181 = load i32, i32* %21, align 4 | |
%182 = add nsw i32 %181, 16 | |
store i32 %182, i32* %21, align 4 | |
br label %102 | |
; <label>:183: ; preds = %176, %105 | |
%184 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %184) #9 | |
br label %185 | |
; block barrier, then warp-level tree reduction of %20 via __shfl_down with
; offsets 16,8,4,2,1 over the full 32-lane width.
; NOTE(review): legacy non-_sync __shfl_down wrapper - fine for sm_35 (module
; name says sm_35), would need __shfl_down_sync on Volta+.
; <label>:185: ; preds = %183 | |
call void @llvm.cuda.syncthreads() | |
%186 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %186) #9 | |
store i32 16, i32* %27, align 4 | |
br label %187 | |
; <label>:187: ; preds = %196, %185 | |
%188 = load i32, i32* %27, align 4 | |
%189 = icmp sgt i32 %188, 0 | |
br i1 %189, label %192, label %190 | |
; <label>:190: ; preds = %187 | |
store i32 17, i32* %16, align 4 | |
%191 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %191) #9 | |
br label %199 | |
; <label>:192: ; preds = %187 | |
%193 = load float, float* %20, align 4 | |
%194 = load i32, i32* %27, align 4 | |
%195 = call float @_ZL11__shfl_downfji(float %193, i32 %194, i32 32) #10 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %195, float* %20) #10 | |
br label %196 | |
; <label>:196: ; preds = %192 | |
%197 = load i32, i32* %27, align 4 | |
%198 = sdiv i32 %197, 2 | |
store i32 %198, i32* %27, align 4 | |
br label %187 | |
; lane 0 of each warp (threadIdx.x & 31 == 0) publishes its partial sum with
; an atomic reduce into output[row]
; <label>:199: ; preds = %190 | |
%200 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%201 = and i32 %200, 31 | |
%202 = icmp eq i32 %201, 0 | |
br i1 %202, label %203, label %209 | |
; <label>:203: ; preds = %199 | |
%204 = load i32, i32* %17, align 4 | |
%205 = sext i32 %204 to i64 | |
%206 = load float*, float** %8, align 8 | |
%207 = getelementptr inbounds float, float* %206, i64 %205 | |
%208 = load float, float* %20, align 4 | |
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %207, float %208, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10 | |
br label %209 | |
; <label>:209: ; preds = %203, %199 | |
%210 = bitcast float* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %210) #9 | |
%211 = bitcast i32* %19 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %211) #9 | |
%212 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %212) #9 | |
br label %213 | |
; barrier before the block claims its next outer work item
; <label>:213: ; preds = %209, %79 | |
call void @llvm.cuda.syncthreads() | |
%214 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %214) #9 | |
br label %215 | |
; <label>:215: ; preds = %213 | |
%216 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%217 = load i32, i32* %15, align 4 | |
%218 = add i32 %217, %216 | |
store i32 %218, i32* %15, align 4 | |
br label %73 | |
; <label>:219: ; preds = %77 | |
%220 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %220) #9 | |
%221 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %221) #9 | |
%222 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %222) #9 | |
%223 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %223) #9 | |
%224 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %224) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
; ---------------------------------------------------------------------------
; Eigen OuterReductionKernel<16, ..., SumReducer<float>> lowered to NVPTX IR
; at -O0. Sums along the outer (strided) dimension: each work item covers a
; 16-row segment of one output column and publishes its partial with an
; atomic reduce.
; Parameters:
;   %0 - SumReducer<float>, byval (stateless)
;   %1 - TensorEvaluator of the reduction input, byval
;   %2 - coefficients reduced per output value (spilled to %6)
;   %3 - number of preserved output coefficients (spilled to %7)
;   %4 - output buffer (spilled to %8)
; NOTE(review): parameter roles inferred from the mangled name and the index
; arithmetic below - confirm against Eigen's TensorReductionCuda.h.
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca float*, align 8 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
; %14 is a cleanup-dispatch slot recording which loop exit was taken
%14 = alloca i32 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca float, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca i32, align 4 | |
%20 = alloca i32, align 4 | |
%21 = alloca float, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
store float* %4, float** %8, align 8 | |
%22 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %22) #9 | |
; %9 = total threads in the grid; %10 = this thread's global linear id
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%24 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%25 = mul i32 %23, %24 | |
store i32 %25, i32* %9, align 4 | |
%26 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %26) #9 | |
%27 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = mul i32 %27, %28 | |
%30 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%31 = add i32 %29, %30 | |
store i32 %31, i32* %10, align 4 | |
; Single-block launches initialize the output buffer first, then barrier.
%32 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%33 = icmp eq i32 %32, 1 | |
br i1 %33, label %34, label %54 | |
; <label>:34: ; preds = %5 | |
%35 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %35) #9 | |
%36 = load i32, i32* %10, align 4 | |
store i32 %36, i32* %11, align 4 | |
br label %37 | |
; grid-stride init loop: output[i] = reducer.initialize() for i in [tid, %7)
; <label>:37: ; preds = %49, %34 | |
%38 = load i32, i32* %11, align 4 | |
%39 = load i32, i32* %7, align 4 | |
%40 = icmp slt i32 %38, %39 | |
br i1 %40, label %43, label %41 | |
; <label>:41: ; preds = %37 | |
%42 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %42) #9 | |
br label %53 | |
; <label>:43: ; preds = %37 | |
%44 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
%45 = load i32, i32* %11, align 4 | |
%46 = sext i32 %45 to i64 | |
%47 = load float*, float** %8, align 8 | |
%48 = getelementptr inbounds float, float* %47, i64 %46 | |
store float %44, float* %48, align 4 | |
br label %49 | |
; <label>:49: ; preds = %43 | |
%50 = load i32, i32* %9, align 4 | |
%51 = load i32, i32* %11, align 4 | |
%52 = add nsw i32 %51, %50 | |
store i32 %52, i32* %11, align 4 | |
br label %37 | |
; <label>:53: ; preds = %41 | |
call void @llvm.cuda.syncthreads() | |
br label %54 | |
; %12 = total work items = num_preserved * divup(num_coeffs_to_reduce, 16)
; <label>:54: ; preds = %53, %5 | |
%55 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
%56 = load i32, i32* %7, align 4 | |
%57 = load i32, i32* %6, align 4 | |
%58 = call i32 @_ZN5Eigen5divupIiEET_S1_S1_(i32 %57, i32 16) #10 | |
%59 = mul nsw i32 %56, %58 | |
store i32 %59, i32* %12, align 4 | |
%60 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %60) #9 | |
; grid-stride loop over work items: i = tid; i < %12; i += total_threads
%61 = load i32, i32* %10, align 4 | |
store i32 %61, i32* %13, align 4 | |
br label %62 | |
; <label>:62: ; preds = %116, %54 | |
%63 = load i32, i32* %13, align 4 | |
%64 = load i32, i32* %12, align 4 | |
%65 = icmp slt i32 %63, %64 | |
br i1 %65, label %68, label %66 | |
; <label>:66: ; preds = %62 | |
store i32 5, i32* %14, align 4 | |
%67 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %67) #9 | |
br label %120 | |
; col = i % num_preserved (%15); first row = (i / num_preserved) * 16 (%16);
; %18 = min(first_row + 16, num_coeffs_to_reduce); %17 = partial sum
; <label>:68: ; preds = %62 | |
%69 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %69) #9 | |
%70 = load i32, i32* %13, align 4 | |
%71 = load i32, i32* %7, align 4 | |
%72 = srem i32 %70, %71 | |
store i32 %72, i32* %15, align 4 | |
%73 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %73) #9 | |
%74 = load i32, i32* %13, align 4 | |
%75 = load i32, i32* %7, align 4 | |
%76 = sdiv i32 %74, %75 | |
%77 = mul nsw i32 %76, 16 | |
store i32 %77, i32* %16, align 4 | |
%78 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %78) #9 | |
%79 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10 | |
store float %79, float* %17, align 4 | |
%80 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %80) #9 | |
%81 = load i32, i32* %16, align 4 | |
%82 = add nsw i32 %81, 16 | |
store i32 %82, i32* %19, align 4 | |
%83 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %19, i32* dereferenceable(4) %6) #10 | |
store i32 %83, i32* %18, align 4 | |
%84 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %84) #9 | |
; accumulate rows [first_row, %18) of column col: linear coefficient index
; = r * num_preserved + col
%85 = load i32, i32* %16, align 4 | |
store i32 %85, i32* %20, align 4 | |
br label %86 | |
; <label>:86: ; preds = %103, %68 | |
%87 = load i32, i32* %20, align 4 | |
%88 = load i32, i32* %18, align 4 | |
%89 = icmp slt i32 %87, %88 | |
br i1 %89, label %92, label %90 | |
; <label>:90: ; preds = %86 | |
store i32 8, i32* %14, align 4 | |
%91 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %91) #9 | |
br label %106 | |
; <label>:92: ; preds = %86 | |
%93 = bitcast float* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %93) #9 | |
; evaluator field 10 is the nested input TensorEvaluator
%94 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10 | |
%95 = load i32, i32* %20, align 4 | |
%96 = load i32, i32* %7, align 4 | |
%97 = mul nsw i32 %95, %96 | |
%98 = load i32, i32* %15, align 4 | |
%99 = add nsw i32 %97, %98 | |
%100 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %94, i32 %99) #10 | |
store float %100, float* %21, align 4 | |
%101 = load float, float* %21, align 4 | |
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %101, float* %17) #10 | |
%102 = bitcast float* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %102) #9 | |
br label %103 | |
; <label>:103: ; preds = %92 | |
%104 = load i32, i32* %20, align 4 | |
%105 = add nsw i32 %104, 1 | |
store i32 %105, i32* %20, align 4 | |
br label %86 | |
; publish the segment's partial sum with an atomic reduce into output[col]
; <label>:106: ; preds = %90 | |
%107 = load i32, i32* %15, align 4 | |
%108 = sext i32 %107 to i64 | |
%109 = load float*, float** %8, align 8 | |
%110 = getelementptr inbounds float, float* %109, i64 %108 | |
%111 = load float, float* %17, align 4 | |
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %110, float %111, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10 | |
%112 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %112) #9 | |
%113 = bitcast float* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %113) #9 | |
%114 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %114) #9 | |
%115 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %115) #9 | |
br label %116 | |
; <label>:116: ; preds = %106 | |
%117 = load i32, i32* %9, align 4 | |
%118 = load i32, i32* %13, align 4 | |
%119 = add nsw i32 %118, %117 | |
store i32 %119, i32* %13, align 4 | |
br label %62 | |
; <label>:120: ; preds = %66 | |
%121 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %121) #9 | |
%122 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %122) #9 | |
%123 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %123) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
; ---------------------------------------------------------------------------
; Eigen's generic EigenMetaKernel entry point for the TensorEvalToOp
; expression: computes this thread's first index and the grid stride, then
; hands off to EigenMetaKernelEval::run, which evaluates one coefficient per
; index. %0 is the byval evaluator (128 bytes), %1 the total coefficient
; count.
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.14"* byval align 8, i32) #0 comdat { | |
%3 = alloca i32, align 4 | |
%4 = alloca i32, align 4 | |
%5 = alloca i32, align 4 | |
%6 = alloca %"struct.Eigen::TensorEvaluator.14", align 8 | |
%7 = alloca i8, align 1 | |
%8 = alloca %"struct.Eigen::TensorEvaluator.14", align 8 | |
store i32 %1, i32* %3, align 4 | |
%9 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
; %4 = first_index = blockIdx.x * blockDim.x + threadIdx.x
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%12 = mul i32 %10, %11 | |
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
%14 = add i32 %12, %13 | |
store i32 %14, i32* %4, align 4 | |
%15 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %15) #9 | |
; %5 = grid stride = blockDim.x * gridDim.x
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%18 = mul i32 %16, %17 | |
store i32 %18, i32* %5, align 4 | |
%19 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8* | |
call void @llvm.lifetime.start(i64 128, i8* %19) #9 | |
; two 128-byte memcpys = -O0 byval copies of the evaluator: once into the
; local %6, then again into %8 for the byval argument of run() below
%20 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8* | |
%21 = bitcast %"struct.Eigen::TensorEvaluator.14"* %0 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 128, i32 8, i1 false) | |
; %7 is a 1-byte flag set to 0 and never re-read here (likely a lowered
; "vectorizable" bool - NOTE(review): confirm against the C++ source)
call void @llvm.lifetime.start(i64 1, i8* %7) #9 | |
store i8 0, i8* %7, align 1 | |
%22 = bitcast %"struct.Eigen::TensorEvaluator.14"* %8 to i8* | |
%23 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8* | |
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 128, i32 8, i1 false) | |
%24 = load i32, i32* %4, align 4 | |
%25 = load i32, i32* %3, align 4 | |
%26 = load i32, i32* %5, align 4 | |
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.14"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10 | |
; destroy both evaluator copies (callee copy %8, then local copy %6)
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.14"* %8) #5 | |
call void @llvm.lifetime.end(i64 1, i8* %7) #9 | |
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.14"* %6) #5 | |
%27 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8* | |
call void @llvm.lifetime.end(i64 128, i8* %27) #9 | |
%28 = bitcast i32* %5 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %28) #9 | |
%29 = bitcast i32* %4 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %29) #9 | |
ret void | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; EigenMetaKernelEval<..., Vectorizable=false>::run: a plain grid-stride loop.
; Arguments: %0 = byval evaluator, %1 = first_index, %2 = size, %3 = stride.
; For (i = first_index; i < size; i += stride) evaluator.evalScalar(i).
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.14"* byval align 8, i32, i32, i32) #2 comdat align 2 { | |
%5 = alloca i32, align 4 | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
store i32 %1, i32* %5, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%9 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
; %8 = loop index i, seeded with first_index
%10 = load i32, i32* %5, align 4 | |
store i32 %10, i32* %8, align 4 | |
br label %11 | |
; loop header: continue while i < size
; <label>:11: ; preds = %19, %4 | |
%12 = load i32, i32* %8, align 4 | |
%13 = load i32, i32* %6, align 4 | |
%14 = icmp slt i32 %12, %13 | |
br i1 %14, label %17, label %15 | |
; <label>:15: ; preds = %11 | |
%16 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
br label %23 | |
; loop body: evaluate one output coefficient
; <label>:17: ; preds = %11 | |
%18 = load i32, i32* %8, align 4 | |
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.14"* %0, i32 %18) #10 | |
br label %19 | |
; loop latch: i += stride
; <label>:19: ; preds = %17 | |
%20 = load i32, i32* %7, align 4 | |
%21 = load i32, i32* %8, align 4 | |
%22 = add nsw i32 %21, %20 | |
store i32 %22, i32* %8, align 4 | |
br label %11 | |
; <label>:23: ; preds = %15 | |
ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind
; Complete-object destructor (D1) thunk for the EvalTo evaluator: the usual
; Itanium-ABI pattern of D1 simply forwarding to the base-object destructor
; (D2) on the same object.
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.14"*) unnamed_addr #4 comdat align 2 {
  %this.addr = alloca %"struct.Eigen::TensorEvaluator.14"*, align 8
  store %"struct.Eigen::TensorEvaluator.14"* %0, %"struct.Eigen::TensorEvaluator.14"** %this.addr, align 8
  %this1 = load %"struct.Eigen::TensorEvaluator.14"*, %"struct.Eigen::TensorEvaluator.14"** %this.addr, align 8
  call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.14"* %this1) #5
  ret void
}
; Function Attrs: convergent inlinehint nounwind | |
; TensorEvaluator<EvalTo<Reduction<Sum>>>::evalScalar(this /* %0 */, index /* %1 */)
; Evaluates one output coefficient of the wrapped reduction expression and
; stores it into the destination buffer:
;   buffer[index] = m_impl.coeff(index);
; where m_impl is struct field 0 (the reduction evaluator) and the float*
; destination buffer is struct field 2.
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.14"*, i32) #4 comdat align 2 { | |
  %3 = alloca %"struct.Eigen::TensorEvaluator.14"*, align 8 | |
  %4 = alloca i32, align 4 | |
  store %"struct.Eigen::TensorEvaluator.14"* %0, %"struct.Eigen::TensorEvaluator.14"** %3, align 8 | |
  store i32 %1, i32* %4, align 4 | |
  %5 = load %"struct.Eigen::TensorEvaluator.14"*, %"struct.Eigen::TensorEvaluator.14"** %3, align 8 | |
  ; %8 = inner reduction evaluator (field 0).coeff(index)
  %6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %5, i32 0, i32 0 | |
  %7 = load i32, i32* %4, align 4 | |
  %8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.12"* %6, i32 %7) #10 | |
  ; store the result into the float* buffer (field 2) at [index]
  %9 = load i32, i32* %4, align 4 | |
  %10 = sext i32 %9 to i64 | |
  %11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %5, i32 0, i32 2 | |
  %12 = load float*, float** %11, align 8 | |
  %13 = getelementptr inbounds float, float* %12, i64 %10 | |
  store float %8, float* %13, align 4 | |
  ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; TensorEvaluator<Reduction<SumReducer<float>>>::coeff(this /* %0 */, index /* %1 */)
; Computes one reduced output coefficient:
;   SumReducer reducer;                       // stack temp %5 (empty struct)
;   float accum = reducer.initialize();       // stack temp %6
;   int first = firstInput(index);            // flat index of first input coeff
;   GenericDimReducer<0,...>::reduce(*this, first, reducer, &accum);
;   return reducer.finalize(accum);
; Note: field 11 (%9) is the evaluator's stored reducer; its address is taken
; here but the call below uses the fresh stack reducer %5 (both are stateless,
; 1-byte structs).
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.12"*, i32) #4 comdat align 2 { | |
  %3 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8 | |
  %4 = alloca i32, align 4 | |
  %5 = alloca %"struct.Eigen::internal::SumReducer", align 1 | |
  %6 = alloca float, align 4 | |
  store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %3, align 8 | |
  store i32 %1, i32* %4, align 4 | |
  %7 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %3, align 8 | |
  %8 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8* | |
  call void @llvm.lifetime.start(i64 1, i8* %8) #9 | |
  %9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %7, i32 0, i32 11 | |
  %10 = bitcast float* %6 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %10) #9 | |
  ; accum = SumReducer::initialize()
  %11 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %5) #10 | |
  store float %11, float* %6, align 4 | |
  ; first = firstInput(index); then reduce the whole dimension into accum
  %12 = load i32, i32* %4, align 4 | |
  %13 = call i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator.12"* %7, i32 %12) #10 | |
  call void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112) %7, i32 %13, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %5, float* %6) #10 | |
  ; return reducer.finalize(accum)
  %14 = load float, float* %6, align 4 | |
  %15 = call float @_ZNK5Eigen8internal10SumReducerIfE8finalizeEf(%"struct.Eigen::internal::SumReducer"* %5, float %14) #10 | |
  %16 = bitcast float* %6 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
  %17 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8* | |
  call void @llvm.lifetime.end(i64 1, i8* %17) #9 | |
  ret float %15 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; GenericDimReducer<DimIndex=0>::reduce(eval /* %0 */, firstIndex /* %1 */,
;                                       reducer /* %2 */, accum /* %3 */)
; Iterates the reduced dimension:
;   for (j = 0; j < eval.field9[0]; ++j)               // extent of reduced dim
;     GenericDimReducer<-1>::reduce(eval,
;         firstIndex + j * eval.field8[0],              // stride of reduced dim
;         reducer, accum);
; field 9 / field 8 are 1-element int arrays accessed via array::operator[](0).
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 { | |
  %5 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8 | |
  %6 = alloca i32, align 4 | |
  %7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
  %8 = alloca float*, align 8 | |
  %9 = alloca i32, align 4 | |
  %10 = alloca i32, align 4 | |
  store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %5, align 8 | |
  store i32 %1, i32* %6, align 4 | |
  store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
  store float* %3, float** %8, align 8 | |
  %11 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %11) #9 | |
  ; j (%9) = 0
  store i32 0, i32* %9, align 4 | |
  br label %12 | |
; loop header: j < reduced-dim extent (field 9, element 0)?
; <label>:12:                                     ; preds = %36, %4 | |
  %13 = load i32, i32* %9, align 4 | |
  %14 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8 | |
  %15 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %14, i32 0, i32 9 | |
  %16 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %15, i64 0) #10 | |
  %17 = load i32, i32* %16, align 4 | |
  %18 = icmp slt i32 %13, %17 | |
  br i1 %18, label %21, label %19 | |
; loop exit
; <label>:19:                                     ; preds = %12 | |
  %20 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %20) #9 | |
  br label %39 | |
; loop body: input (%10) = firstIndex + j * stride (field 8, element 0),
; then recurse into the DimIndex=-1 leaf reducer
; <label>:21:                                     ; preds = %12 | |
  %22 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %22) #9 | |
  %23 = load i32, i32* %6, align 4 | |
  %24 = load i32, i32* %9, align 4 | |
  %25 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8 | |
  %26 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %25, i32 0, i32 8 | |
  %27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %26, i64 0) #10 | |
  %28 = load i32, i32* %27, align 4 | |
  %29 = mul nsw i32 %24, %28 | |
  %30 = add nsw i32 %23, %29 | |
  store i32 %30, i32* %10, align 4 | |
  %31 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8 | |
  %32 = load i32, i32* %10, align 4 | |
  %33 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
  %34 = load float*, float** %8, align 8 | |
  call void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112) %31, i32 %32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %33, float* %34) #10 | |
  %35 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %35) #9 | |
  br label %36 | |
; loop latch: ++j
; <label>:36:                                     ; preds = %21 | |
  %37 = load i32, i32* %9, align 4 | |
  %38 = add nsw i32 %37, 1 | |
  store i32 %38, i32* %9, align 4 | |
  br label %12 | |
; <label>:39:                                     ; preds = %19 | |
  ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; TensorEvaluator<Reduction<Sum>>::firstInput(this /* %0 */, index /* %1 */)
; Maps an output (reduced-tensor) flat index to the flat index of the first
; input coefficient contributing to it.
; Locals: startInput (%5), loop counter i (%6), per-iteration quotient idx (%7).
; NOTE: i is initialized to 0 and the loop condition is `i > 0`, so the loop
; body (the divisor/stride walk over outer output dims) never executes in this
; 1-output-dim instantiation — it is statically dead code kept by -O0.
; The result reduces to: startInput = index * field7 + 0.
define linkonce_odr i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator.12"*, i32) #4 comdat align 2 { | |
  %3 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8 | |
  %4 = alloca i32, align 4 | |
  %5 = alloca i32, align 4 | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %3, align 8 | |
  store i32 %1, i32* %4, align 4 | |
  %8 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %3, align 8 | |
  %9 = bitcast i32* %5 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
  ; startInput = 0; i = 0
  store i32 0, i32* %5, align 4 | |
  %10 = bitcast i32* %6 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %10) #9 | |
  store i32 0, i32* %6, align 4 | |
  br label %11 | |
; loop header: i > 0 — false on entry (i starts at 0), so blocks 16..42 are dead
; <label>:11:                                     ; preds = %42, %2 | |
  %12 = load i32, i32* %6, align 4 | |
  %13 = icmp sgt i32 %12, 0 | |
  br i1 %13, label %16, label %14 | |
; loop exit (taken immediately)
; <label>:14:                                     ; preds = %11 | |
  %15 = bitcast i32* %6 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %15) #9 | |
  br label %45 | |
; (dead) loop body: idx = index / fast-divisor[i] (field 4);
; startInput += idx * outputStride[i] (field 5);
; index -= idx * inputStride[i] (field 3)
; <label>:16:                                     ; preds = %11 | |
  %17 = bitcast i32* %7 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %17) #9 | |
  %18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 4 | |
  %19 = load i32, i32* %6, align 4 | |
  %20 = sext i32 %19 to i64 | |
  %21 = call dereferenceable(12) %"struct.Eigen::internal::TensorIntDivisor"* @_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm(%"class.Eigen::array.2"* %18, i64 %20) #10 | |
  %22 = call i32 @_ZN5Eigen8internaldvIiLb0EEET_RKS2_RKNS0_16TensorIntDivisorIS2_XT0_EEE(i32* dereferenceable(4) %4, %"struct.Eigen::internal::TensorIntDivisor"* dereferenceable(12) %21) #10 | |
  store i32 %22, i32* %7, align 4 | |
  %23 = load i32, i32* %7, align 4 | |
  %24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 5 | |
  %25 = load i32, i32* %6, align 4 | |
  %26 = sext i32 %25 to i64 | |
  %27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %24, i64 %26) #10 | |
  %28 = load i32, i32* %27, align 4 | |
  %29 = mul nsw i32 %23, %28 | |
  %30 = load i32, i32* %5, align 4 | |
  %31 = add nsw i32 %30, %29 | |
  store i32 %31, i32* %5, align 4 | |
  %32 = load i32, i32* %7, align 4 | |
  %33 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 3 | |
  %34 = load i32, i32* %6, align 4 | |
  %35 = sext i32 %34 to i64 | |
  %36 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %33, i64 %35) #10 | |
  %37 = load i32, i32* %36, align 4 | |
  %38 = mul nsw i32 %32, %37 | |
  %39 = load i32, i32* %4, align 4 | |
  %40 = sub nsw i32 %39, %38 | |
  store i32 %40, i32* %4, align 4 | |
  %41 = bitcast i32* %7 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %41) #9 | |
  br label %42 | |
; (dead) loop latch: --i
; <label>:42:                                     ; preds = %16 | |
  %43 = load i32, i32* %6, align 4 | |
  %44 = add nsw i32 %43, -1 | |
  store i32 %44, i32* %6, align 4 | |
  br label %11 | |
; epilogue: startInput += index * field7; return startInput
; <label>:45:                                     ; preds = %14 | |
  %46 = load i32, i32* %4, align 4 | |
  %47 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 7 | |
  %48 = load i32, i32* %47, align 8 | |
  %49 = mul nsw i32 %46, %48 | |
  %50 = load i32, i32* %5, align 4 | |
  %51 = add nsw i32 %50, %49 | |
  store i32 %51, i32* %5, align 4 | |
  %52 = load i32, i32* %5, align 4 | |
  %53 = bitcast i32* %5 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %53) #9 | |
  ret i32 %52 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; GenericDimReducer<DimIndex=-1>::reduce — recursion leaf.
; Reads one coefficient from the wrapped input evaluator (struct field 10,
; the TensorMap evaluator) at the given flat index and folds it into the
; accumulator:
;   reducer.reduce(eval.m_impl.coeff(index), accum);
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 { | |
  %5 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8 | |
  %6 = alloca i32, align 4 | |
  %7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8 | |
  %8 = alloca float*, align 8 | |
  store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %5, align 8 | |
  store i32 %1, i32* %6, align 4 | |
  store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
  store float* %3, float** %8, align 8 | |
  %9 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8 | |
  %10 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8 | |
  ; %13 = inner TensorMap evaluator (field 10).coeff(index)
  %11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %10, i32 0, i32 10 | |
  %12 = load i32, i32* %6, align 4 | |
  %13 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %11, i32 %12) #10 | |
  ; reducer.reduce(value, accum) — accumulates into *accum
  %14 = load float*, float** %8, align 8 | |
  call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %9, float %13, float* %14) #10 | |
  ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; Base-object destructor (D2) for TensorEvaluator<EvalTo<Reduction<Sum>>,
; GpuDevice>. The body only spills and reloads `this` and performs no other
; work — the destructor is a no-op for this type.
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.14"*) unnamed_addr #4 comdat align 2 { | |
  %2 = alloca %"struct.Eigen::TensorEvaluator.14"*, align 8 | |
  store %"struct.Eigen::TensorEvaluator.14"* %0, %"struct.Eigen::TensorEvaluator.14"** %2, align 8 | |
  %3 = load %"struct.Eigen::TensorEvaluator.14"*, %"struct.Eigen::TensorEvaluator.14"** %2, align 8 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
; RowReduceKernel<GridSize=32, BlockSize=256, NumPerThread=128,
;                 TensorEvaluator<TensorMap<Tensor<float,2>>>, PtrWrapper<float,int>,
;                 CudaSumReducer>
; Arguments: reducer (%0, byval), input evaluator (%1, byval),
;            num_rows (%2), num_cols (%3), output wrapper (%4, byval).
; Structure (all grounded in the constants below):
;   1. Asserts the launch shape: blockDim == (256,1,1), gridDim == (32,1,1).
;   2. num_col_blocks (%9) = ceil(num_cols / 32768), where 32768 = 256*128
;      (BlockSize * NumPerThread); total work items (%10) = num_col_blocks * num_rows.
;   3. Grid-stride loop over i (%13) from blockIdx.x, step 32 (the grid size):
;      col_block = i % num_col_blocks, row = i / num_col_blocks,
;      col_begin (%17) = col_block*256*128 + threadIdx.x, accum (%19) starts at
;      reducer.bottom_value().
;   4. If row < num_rows, sweep the columns in chunks of 16 iterations
;      (unroll factor 16, block stride 256):
;      - boundary path (label 120): per-element `col < num_cols` guard;
;      - fast path (label 158): unguarded 16-wide unrolled accumulation.
;      %14 is a cleanup-destination slot driving the `switch`es that route
;      break/continue out of the nested loops.
;   5. Warp reduction (label 198): offsets 16,8,4,2,1 via the legacy,
;      mask-less __shfl_down(value, offset, width=32) combined through the
;      reducer (pre-sm_70 implicit-warp-sync idiom).
;   6. Lane 0 of each warp (threadIdx.x & 31 == 0, label 215) publishes its
;      partial via reducer.atomic_reduce(&output.coeffRef(row), accum).
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  %9 = alloca i32, align 4 | |
  %10 = alloca i32, align 4 | |
  %11 = alloca i32, align 4 | |
  %12 = alloca i32, align 4 | |
  %13 = alloca i32, align 4 | |
  %14 = alloca i32 | |
  %15 = alloca i32, align 4 | |
  %16 = alloca i32, align 4 | |
  %17 = alloca i32, align 4 | |
  %18 = alloca i32, align 4 | |
  %19 = alloca float, align 4 | |
  %20 = alloca i32, align 4 | |
  %21 = alloca i32, align 4 | |
  %22 = alloca i32, align 4 | |
  %23 = alloca i32, align 4 | |
  %24 = alloca i32, align 4 | |
  %25 = alloca i32, align 4 | |
  %26 = alloca i32, align 4 | |
  %27 = alloca i32, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
  ; launch-shape asserts: blockDim must be (256,1,1)
  %28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %29 = icmp eq i32 %28, 256 | |
  br i1 %29, label %30, label %31 | |
; <label>:30:                                     ; preds = %5 | |
  br label %32 | |
; <label>:31:                                     ; preds = %5 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %32 | |
; <label>:32:                                     ; preds = %31, %30 | |
  %33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
  %34 = icmp eq i32 %33, 1 | |
  br i1 %34, label %35, label %36 | |
; <label>:35:                                     ; preds = %32 | |
  br label %37 | |
; <label>:36:                                     ; preds = %32 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %37 | |
; <label>:37:                                     ; preds = %36, %35 | |
  %38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
  %39 = icmp eq i32 %38, 1 | |
  br i1 %39, label %40, label %41 | |
; <label>:40:                                     ; preds = %37 | |
  br label %42 | |
; <label>:41:                                     ; preds = %37 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %42 | |
; gridDim must be (32,1,1)
; <label>:42:                                     ; preds = %41, %40 | |
  %43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %44 = icmp eq i32 %43, 32 | |
  br i1 %44, label %45, label %46 | |
; <label>:45:                                     ; preds = %42 | |
  br label %47 | |
; <label>:46:                                     ; preds = %42 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %47 | |
; <label>:47:                                     ; preds = %46, %45 | |
  %48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
  %49 = icmp eq i32 %48, 1 | |
  br i1 %49, label %50, label %51 | |
; <label>:50:                                     ; preds = %47 | |
  br label %52 | |
; <label>:51:                                     ; preds = %47 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %52 | |
; <label>:52:                                     ; preds = %51, %50 | |
  %53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
  %54 = icmp eq i32 %53, 1 | |
  br i1 %54, label %55, label %56 | |
; <label>:55:                                     ; preds = %52 | |
  br label %57 | |
; <label>:56:                                     ; preds = %52 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %57 | |
; prologue:
;   %8  = unroll count 16
;   %9  = num_col_blocks = (num_cols + 32768 - 1) / 32768   (32768 = 256*128)
;   %10 = total work items = num_col_blocks * num_rows
;   %11 = blockIdx.x, %12 = threadIdx.x, %13 = loop index i (starts at blockIdx.x)
; <label>:57:                                     ; preds = %56, %55 | |
  %58 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %58) #9 | |
  store i32 16, i32* %8, align 4 | |
  %59 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
  %60 = load i32, i32* %7, align 4 | |
  %61 = add nsw i32 %60, 32768 | |
  %62 = sub nsw i32 %61, 1 | |
  %63 = sdiv i32 %62, 32768 | |
  store i32 %63, i32* %9, align 4 | |
  %64 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %64) #9 | |
  %65 = load i32, i32* %9, align 4 | |
  %66 = load i32, i32* %6, align 4 | |
  %67 = mul nsw i32 %65, %66 | |
  store i32 %67, i32* %10, align 4 | |
  %68 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %68) #9 | |
  %69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %69, i32* %11, align 4 | |
  %70 = bitcast i32* %12 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %70) #9 | |
  %71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %71, i32* %12, align 4 | |
  %72 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %72) #9 | |
  %73 = load i32, i32* %11, align 4 | |
  store i32 %73, i32* %13, align 4 | |
  br label %74 | |
; outer grid-stride loop header: i < total work items?
; <label>:74:                                     ; preds = %237, %57 | |
  %75 = load i32, i32* %13, align 4 | |
  %76 = load i32, i32* %10, align 4 | |
  %77 = icmp slt i32 %75, %76 | |
  br i1 %77, label %80, label %78 | |
; outer-loop exit (cleanup-dest 2 = function epilogue)
; <label>:78:                                     ; preds = %74 | |
  store i32 2, i32* %14, align 4 | |
  %79 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %79) #9 | |
  br label %240 | |
; outer-loop body:
;   %15 col_block = i % num_col_blocks;  %16 row = i / num_col_blocks
;   %17 col_begin = col_block*256*128 + threadIdx.x;  %18 = row
;   %19 accum = reducer.bottom_value()
; <label>:80:                                     ; preds = %74 | |
  %81 = bitcast i32* %15 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %81) #9 | |
  %82 = load i32, i32* %13, align 4 | |
  %83 = load i32, i32* %9, align 4 | |
  %84 = srem i32 %82, %83 | |
  store i32 %84, i32* %15, align 4 | |
  %85 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
  %86 = load i32, i32* %13, align 4 | |
  %87 = load i32, i32* %9, align 4 | |
  %88 = sdiv i32 %86, %87 | |
  store i32 %88, i32* %16, align 4 | |
  %89 = bitcast i32* %17 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %89) #9 | |
  %90 = load i32, i32* %15, align 4 | |
  %91 = mul nsw i32 %90, 256 | |
  %92 = mul nsw i32 %91, 128 | |
  %93 = load i32, i32* %12, align 4 | |
  %94 = add nsw i32 %92, %93 | |
  store i32 %94, i32* %17, align 4 | |
  %95 = bitcast i32* %18 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
  %96 = load i32, i32* %16, align 4 | |
  store i32 %96, i32* %18, align 4 | |
  %97 = bitcast float* %19 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %97) #9 | |
  %98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10 | |
  store float %99, float* %19, align 4 | |
  ; only accumulate when row is in range
  %100 = load i32, i32* %18, align 4 | |
  %101 = load i32, i32* %6, align 4 | |
  %102 = icmp slt i32 %100, %101 | |
  br i1 %102, label %103, label %198 | |
; column loop: %20 steps 0,16,32,... up to 128 (unroll-by-16 chunks)
; <label>:103:                                     ; preds = %80 | |
  %104 = bitcast i32* %20 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %104) #9 | |
  store i32 0, i32* %20, align 4 | |
  br label %105 | |
; <label>:105:                                    ; preds = %192, %103 | |
  %106 = load i32, i32* %20, align 4 | |
  %107 = icmp slt i32 %106, 128 | |
  br i1 %107, label %109, label %108 | |
; <label>:108:                                    ; preds = %105 | |
  store i32 5, i32* %14, align 4 | |
  br label %195 | |
; chunk dispatch: %21 = index of the LAST element of this 16-wide chunk
; (col_begin + 256*(chunk+15)); if it overruns num_cols take the guarded
; path (120), else the unguarded fast path (158)
; <label>:109:                                    ; preds = %105 | |
  %110 = bitcast i32* %21 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %110) #9 | |
  %111 = load i32, i32* %17, align 4 | |
  %112 = load i32, i32* %20, align 4 | |
  %113 = add nsw i32 %112, 16 | |
  %114 = sub nsw i32 %113, 1 | |
  %115 = mul nsw i32 256, %114 | |
  %116 = add nsw i32 %111, %115 | |
  store i32 %116, i32* %21, align 4 | |
  %117 = load i32, i32* %21, align 4 | |
  %118 = load i32, i32* %7, align 4 | |
  %119 = icmp sge i32 %117, %118 | |
  br i1 %119, label %120, label %158 | |
; guarded (boundary) path: up to 15 elements, each checked against num_cols
; <label>:120:                                    ; preds = %109 | |
  %121 = bitcast i32* %22 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %121) #9 | |
  store i32 0, i32* %22, align 4 | |
  br label %122 | |
; <label>:122:                                    ; preds = %152, %120 | |
  %123 = load i32, i32* %22, align 4 | |
  %124 = icmp slt i32 %123, 15 | |
  br i1 %124, label %126, label %125 | |
; <label>:125:                                    ; preds = %122 | |
  store i32 8, i32* %14, align 4 | |
  br label %155 | |
; %23 = col = col_begin + 256*(chunk + k); break out once col >= num_cols
; <label>:126:                                    ; preds = %122 | |
  %127 = bitcast i32* %23 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %127) #9 | |
  %128 = load i32, i32* %17, align 4 | |
  %129 = load i32, i32* %20, align 4 | |
  %130 = load i32, i32* %22, align 4 | |
  %131 = add nsw i32 %129, %130 | |
  %132 = mul nsw i32 256, %131 | |
  %133 = add nsw i32 %128, %132 | |
  store i32 %133, i32* %23, align 4 | |
  %134 = load i32, i32* %23, align 4 | |
  %135 = load i32, i32* %7, align 4 | |
  %136 = icmp sge i32 %134, %135 | |
  br i1 %136, label %137, label %138 | |
; <label>:137:                                    ; preds = %126 | |
  store i32 8, i32* %14, align 4 | |
  br label %148 | |
; accum = reducer(accum, input.coeff(row*num_cols + col))
; <label>:138:                                    ; preds = %126 | |
  %139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %140 = load float, float* %19, align 4 | |
  %141 = load i32, i32* %18, align 4 | |
  %142 = load i32, i32* %7, align 4 | |
  %143 = mul nsw i32 %141, %142 | |
  %144 = load i32, i32* %23, align 4 | |
  %145 = add nsw i32 %143, %144 | |
  %146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %145) #10 | |
  %147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10 | |
  store float %147, float* %19, align 4 | |
  store i32 0, i32* %14, align 4 | |
  br label %148 | |
; cleanup-dest switch: 0 = continue inner loop, otherwise break
; <label>:148:                                    ; preds = %138, %137 | |
  %149 = bitcast i32* %23 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %149) #9 | |
  %150 = load i32, i32* %14, align 4 | |
  switch i32 %150, label %155 [ | |
    i32 0, label %151 | |
  ] | |
; <label>:151:                                    ; preds = %148 | |
  br label %152 | |
; <label>:152:                                    ; preds = %151 | |
  %153 = load i32, i32* %22, align 4 | |
  %154 = add nsw i32 %153, 1 | |
  store i32 %154, i32* %22, align 4 | |
  br label %122, !llvm.loop !72 | |
; <label>:155:                                    ; preds = %148, %125 | |
  %156 = bitcast i32* %22 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %156) #9 | |
  br label %157 | |
; <label>:157:                                    ; preds = %155 | |
  store i32 5, i32* %14, align 4 | |
  br label %188 | |
; fast path: whole 16-wide chunk is in range — no per-element guard
; <label>:158:                                    ; preds = %109 | |
  %159 = bitcast i32* %24 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %159) #9 | |
  store i32 0, i32* %24, align 4 | |
  br label %160 | |
; <label>:160:                                    ; preds = %183, %158 | |
  %161 = load i32, i32* %24, align 4 | |
  %162 = icmp slt i32 %161, 16 | |
  br i1 %162, label %165, label %163 | |
; <label>:163:                                    ; preds = %160 | |
  store i32 11, i32* %14, align 4 | |
  %164 = bitcast i32* %24 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %164) #9 | |
  br label %186 | |
; accum = reducer(accum, input.coeff(row*num_cols + col_begin + 256*(chunk+k)))
; <label>:165:                                    ; preds = %160 | |
  %166 = bitcast i32* %25 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %166) #9 | |
  %167 = load i32, i32* %17, align 4 | |
  %168 = load i32, i32* %20, align 4 | |
  %169 = load i32, i32* %24, align 4 | |
  %170 = add nsw i32 %168, %169 | |
  %171 = mul nsw i32 256, %170 | |
  %172 = add nsw i32 %167, %171 | |
  store i32 %172, i32* %25, align 4 | |
  %173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %174 = load float, float* %19, align 4 | |
  %175 = load i32, i32* %18, align 4 | |
  %176 = load i32, i32* %7, align 4 | |
  %177 = mul nsw i32 %175, %176 | |
  %178 = load i32, i32* %25, align 4 | |
  %179 = add nsw i32 %177, %178 | |
  %180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %179) #10 | |
  %181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10 | |
  store float %181, float* %19, align 4 | |
  %182 = bitcast i32* %25 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %182) #9 | |
  br label %183 | |
; <label>:183:                                    ; preds = %165 | |
  %184 = load i32, i32* %24, align 4 | |
  %185 = add nsw i32 %184, 1 | |
  store i32 %185, i32* %24, align 4 | |
  br label %160, !llvm.loop !73 | |
; <label>:186:                                    ; preds = %163 | |
  br label %187 | |
; <label>:187:                                    ; preds = %186 | |
  store i32 0, i32* %14, align 4 | |
  br label %188 | |
; cleanup-dest switch: 0 = next 16-wide chunk, otherwise leave column loop
; <label>:188:                                    ; preds = %187, %157 | |
  %189 = bitcast i32* %21 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %189) #9 | |
  %190 = load i32, i32* %14, align 4 | |
  switch i32 %190, label %195 [ | |
    i32 0, label %191 | |
  ] | |
; <label>:191:                                    ; preds = %188 | |
  br label %192 | |
; <label>:192:                                    ; preds = %191 | |
  %193 = load i32, i32* %20, align 4 | |
  %194 = add nsw i32 %193, 16 | |
  store i32 %194, i32* %20, align 4 | |
  br label %105, !llvm.loop !74 | |
; <label>:195:                                    ; preds = %188, %108 | |
  %196 = bitcast i32* %20 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %196) #9 | |
  br label %197 | |
; <label>:197:                                    ; preds = %195 | |
  br label %198 | |
; warp reduction: combine accum across lanes with legacy mask-less
; __shfl_down(val, offset, width=32) at offsets 16, 8, 4, 2, 1
; <label>:198:                                    ; preds = %197, %80 | |
  %199 = bitcast i32* %26 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %199) #9 | |
  store i32 16, i32* %26, align 4 | |
  br label %200 | |
; <label>:200:                                    ; preds = %212, %198 | |
  %201 = load i32, i32* %26, align 4 | |
  %202 = icmp sgt i32 %201, 0 | |
  br i1 %202, label %205, label %203 | |
; <label>:203:                                    ; preds = %200 | |
  store i32 14, i32* %14, align 4 | |
  %204 = bitcast i32* %26 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %204) #9 | |
  br label %215 | |
; <label>:205:                                    ; preds = %200 | |
  %206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %207 = load float, float* %19, align 4 | |
  %208 = load i32, i32* %26, align 4 | |
  %209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10 | |
  %210 = load float, float* %19, align 4 | |
  %211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10 | |
  store float %211, float* %19, align 4 | |
  br label %212 | |
; <label>:212:                                    ; preds = %205 | |
  %213 = load i32, i32* %26, align 4 | |
  %214 = sdiv i32 %213, 2 | |
  store i32 %214, i32* %26, align 4 | |
  br label %200, !llvm.loop !75 | |
; publish: lane 0 of each warp (%27 = threadIdx.x & 31 == 0) and row in range
; -> reducer.atomic_reduce(&output.coeffRef(row), accum)
; <label>:215:                                    ; preds = %203 | |
  %216 = bitcast i32* %27 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %216) #9 | |
  %217 = load i32, i32* %12, align 4 | |
  %218 = and i32 %217, 31 | |
  store i32 %218, i32* %27, align 4 | |
  %219 = load i32, i32* %27, align 4 | |
  %220 = icmp eq i32 %219, 0 | |
  br i1 %220, label %221, label %230 | |
; <label>:221:                                    ; preds = %215 | |
  %222 = load i32, i32* %18, align 4 | |
  %223 = load i32, i32* %6, align 4 | |
  %224 = icmp slt i32 %222, %223 | |
  br i1 %224, label %225, label %230 | |
; <label>:225:                                    ; preds = %221 | |
  %226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %227 = load i32, i32* %18, align 4 | |
  %228 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %227) #10 | |
  %229 = load float, float* %19, align 4 | |
  call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10 | |
  br label %230 | |
; per-iteration cleanup of the outer-loop locals
; <label>:230:                                    ; preds = %225, %221, %215 | |
  %231 = bitcast i32* %27 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %231) #9 | |
  %232 = bitcast float* %19 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %232) #9 | |
  %233 = bitcast i32* %18 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %233) #9 | |
  %234 = bitcast i32* %17 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %234) #9 | |
  %235 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %235) #9 | |
  %236 = bitcast i32* %15 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %236) #9 | |
  br label %237 | |
; outer-loop latch: i += 32 (grid stride = gridDim.x, asserted above)
; <label>:237:                                    ; preds = %230 | |
  %238 = load i32, i32* %13, align 4 | |
  %239 = add nsw i32 %238, 32 | |
  store i32 %239, i32* %13, align 4 | |
  br label %74 | |
; function epilogue: end lifetimes of prologue locals
; <label>:240:                                    ; preds = %78 | |
  %241 = bitcast i32* %12 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %241) #9 | |
  %242 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %242) #9 | |
  %243 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %243) #9 | |
  %244 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %244) #9 | |
  %245 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %245) #9 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
;
; NOTE(review): machine-generated, unoptimized NVPTX LLVM IR (every local is an
; alloca). This is Eigen's RowReduceKernel<BLOCKS=32, THREADS=256, UNROLL=128>
; instantiated with a float TensorEvaluator input, PtrWrapper<float,int> output
; and the anonymous-namespace CudaMaxReducer. SSA value/label numbers are
; positional -- do not insert or delete instructions by hand.
;
; Arguments: %0 = reducer (byval), %1 = input evaluator (byval),
; %2/%3 = two i32 extents stored into %6/%7. From their uses below, %6 bounds
; the per-iteration row index (%18) and %7 is the row stride in the flat index
; (row * %7 + col), so presumably %6 = num_rows and %7 = num_cols -- TODO
; confirm against the Eigen TensorReductionCuda source. %4 = output wrapper.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  %9 = alloca i32, align 4 | |
  %10 = alloca i32, align 4 | |
  %11 = alloca i32, align 4 | |
  %12 = alloca i32, align 4 | |
  %13 = alloca i32, align 4 | |
  ; %14 is the "cleanup destination" slot used to route break/continue through
  ; the lifetime-end cleanup blocks (values 0/2/5/8/11/14 select exits below).
  %14 = alloca i32 | |
  %15 = alloca i32, align 4 | |
  %16 = alloca i32, align 4 | |
  %17 = alloca i32, align 4 | |
  %18 = alloca i32, align 4 | |
  %19 = alloca float, align 4 | |
  %20 = alloca i32, align 4 | |
  %21 = alloca i32, align 4 | |
  %22 = alloca i32, align 4 | |
  %23 = alloca i32, align 4 | |
  %24 = alloca i32, align 4 | |
  %25 = alloca i32, align 4 | |
  %26 = alloca i32, align 4 | |
  %27 = alloca i32, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
  ; --- launch-configuration assertions: blockDim must be (256,1,1) and
  ; gridDim must be (32,1,1), matching the template parameters above. Each
  ; check traps via __assert_fail on mismatch. ---
  %28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %29 = icmp eq i32 %28, 256 | |
  br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %5 | |
  br label %32 | |
; <label>:31: ; preds = %5 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %32 | |
; <label>:32: ; preds = %31, %30 | |
  %33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
  %34 = icmp eq i32 %33, 1 | |
  br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
  br label %37 | |
; <label>:36: ; preds = %32 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %37 | |
; <label>:37: ; preds = %36, %35 | |
  %38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
  %39 = icmp eq i32 %38, 1 | |
  br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
  br label %42 | |
; <label>:41: ; preds = %37 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %42 | |
; <label>:42: ; preds = %41, %40 | |
  %43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %44 = icmp eq i32 %43, 32 | |
  br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
  br label %47 | |
; <label>:46: ; preds = %42 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %47 | |
; <label>:47: ; preds = %46, %45 | |
  %48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
  %49 = icmp eq i32 %48, 1 | |
  br i1 %49, label %50, label %51 | |
; <label>:50: ; preds = %47 | |
  br label %52 | |
; <label>:51: ; preds = %47 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %52 | |
; <label>:52: ; preds = %51, %50 | |
  %53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
  %54 = icmp eq i32 %53, 1 | |
  br i1 %54, label %55, label %56 | |
; <label>:55: ; preds = %52 | |
  br label %57 | |
; <label>:56: ; preds = %52 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %57 | |
; <label>:57: ; preds = %56, %55 | |
  ; --- setup: %8 = 16 (constant, only lifetime-managed here);
  ; %9 = ceil(%7 / 32768)  (32768 = 256 threads * 128 unroll = columns covered
  ; per block-iteration); %10 = %9 * %6 = total (col-strip, row) work items;
  ; %11 = blockIdx.x; %12 = threadIdx.x; %13 = outer loop counter. ---
  %58 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %58) #9 | |
  store i32 16, i32* %8, align 4 | |
  %59 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
  %60 = load i32, i32* %7, align 4 | |
  %61 = add nsw i32 %60, 32768 | |
  %62 = sub nsw i32 %61, 1 | |
  %63 = sdiv i32 %62, 32768 | |
  store i32 %63, i32* %9, align 4 | |
  %64 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %64) #9 | |
  %65 = load i32, i32* %9, align 4 | |
  %66 = load i32, i32* %6, align 4 | |
  %67 = mul nsw i32 %65, %66 | |
  store i32 %67, i32* %10, align 4 | |
  %68 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %68) #9 | |
  %69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %69, i32* %11, align 4 | |
  %70 = bitcast i32* %12 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %70) #9 | |
  %71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %71, i32* %12, align 4 | |
  %72 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %72) #9 | |
  %73 = load i32, i32* %11, align 4 | |
  store i32 %73, i32* %13, align 4 | |
  br label %74 | |
; <label>:74: ; preds = %232, %57 | |
  ; Outer grid-stride loop: %13 runs from blockIdx.x to %10 in steps of 32
  ; (= gridDim.x, asserted above; increment at label %232).
  %75 = load i32, i32* %13, align 4 | |
  %76 = load i32, i32* %10, align 4 | |
  %77 = icmp slt i32 %75, %76 | |
  br i1 %77, label %80, label %78 | |
; <label>:78: ; preds = %74 | |
  store i32 2, i32* %14, align 4 | |
  %79 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %79) #9 | |
  br label %235 | |
; <label>:80: ; preds = %74 | |
  ; Decompose the work item: %15 = %13 mod %9 (column strip index),
  ; %16 = %13 div %9 (row index), %17 = strip*256*128 + threadIdx.x (this
  ; thread's base column), %18 = row, %19 = accumulator seeded with the
  ; reducer's bottom value (identity for max).
  %81 = bitcast i32* %15 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %81) #9 | |
  %82 = load i32, i32* %13, align 4 | |
  %83 = load i32, i32* %9, align 4 | |
  %84 = srem i32 %82, %83 | |
  store i32 %84, i32* %15, align 4 | |
  %85 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
  %86 = load i32, i32* %13, align 4 | |
  %87 = load i32, i32* %9, align 4 | |
  %88 = sdiv i32 %86, %87 | |
  store i32 %88, i32* %16, align 4 | |
  %89 = bitcast i32* %17 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %89) #9 | |
  %90 = load i32, i32* %15, align 4 | |
  %91 = mul nsw i32 %90, 256 | |
  %92 = mul nsw i32 %91, 128 | |
  %93 = load i32, i32* %12, align 4 | |
  %94 = add nsw i32 %92, %93 | |
  store i32 %94, i32* %17, align 4 | |
  %95 = bitcast i32* %18 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
  %96 = load i32, i32* %16, align 4 | |
  store i32 %96, i32* %18, align 4 | |
  %97 = bitcast float* %19 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %97) #9 | |
  %98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
  store float %98, float* %19, align 4 | |
  %99 = load i32, i32* %18, align 4 | |
  %100 = load i32, i32* %6, align 4 | |
  %101 = icmp slt i32 %99, %100 | |
  br i1 %101, label %102, label %195 | |
; <label>:102: ; preds = %80 | |
  ; Row in range: accumulate this thread's slice of the row, strip-mined in
  ; chunks of 16 (%20 = 0,16,...,<128), each chunk covering columns
  ; %17 + 256*(%20+j).
  %103 = bitcast i32* %20 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %103) #9 | |
  store i32 0, i32* %20, align 4 | |
  br label %104 | |
; <label>:104: ; preds = %189, %102 | |
  %105 = load i32, i32* %20, align 4 | |
  %106 = icmp slt i32 %105, 128 | |
  br i1 %106, label %108, label %107 | |
; <label>:107: ; preds = %104 | |
  store i32 5, i32* %14, align 4 | |
  br label %192 | |
; <label>:108: ; preds = %104 | |
  ; %21 = last column this 16-wide chunk would touch. If it overruns %7,
  ; take the per-element bounds-checked path (label %119), else the
  ; unchecked path (label %156).
  %109 = bitcast i32* %21 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %109) #9 | |
  %110 = load i32, i32* %17, align 4 | |
  %111 = load i32, i32* %20, align 4 | |
  %112 = add nsw i32 %111, 16 | |
  %113 = sub nsw i32 %112, 1 | |
  %114 = mul nsw i32 256, %113 | |
  %115 = add nsw i32 %110, %114 | |
  store i32 %115, i32* %21, align 4 | |
  %116 = load i32, i32* %21, align 4 | |
  %117 = load i32, i32* %7, align 4 | |
  %118 = icmp sge i32 %116, %117 | |
  br i1 %118, label %119, label %156 | |
; <label>:119: ; preds = %108 | |
  ; Checked path: %22 iterates [0,15) -- one fewer than the unchecked
  ; path's 16. NOTE(review): this mirrors whatever the C++ source compiled
  ; from; verify the 15-vs-16 trip count upstream before assuming a bug.
  %120 = bitcast i32* %22 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %120) #9 | |
  store i32 0, i32* %22, align 4 | |
  br label %121 | |
; <label>:121: ; preds = %150, %119 | |
  %122 = load i32, i32* %22, align 4 | |
  %123 = icmp slt i32 %122, 15 | |
  br i1 %123, label %125, label %124 | |
; <label>:124: ; preds = %121 | |
  store i32 8, i32* %14, align 4 | |
  br label %153 | |
; <label>:125: ; preds = %121 | |
  %126 = bitcast i32* %23 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %126) #9 | |
  %127 = load i32, i32* %17, align 4 | |
  %128 = load i32, i32* %20, align 4 | |
  %129 = load i32, i32* %22, align 4 | |
  %130 = add nsw i32 %128, %129 | |
  %131 = mul nsw i32 256, %130 | |
  %132 = add nsw i32 %127, %131 | |
  store i32 %132, i32* %23, align 4 | |
  %133 = load i32, i32* %23, align 4 | |
  %134 = load i32, i32* %7, align 4 | |
  %135 = icmp sge i32 %133, %134 | |
  br i1 %135, label %136, label %137 | |
; <label>:136: ; preds = %125 | |
  store i32 8, i32* %14, align 4 | |
  br label %146 | |
; <label>:137: ; preds = %125 | |
  ; In-bounds element: accum = reducer(accum, coeff(row * %7 + col)).
  %138 = load float, float* %19, align 4 | |
  %139 = load i32, i32* %18, align 4 | |
  %140 = load i32, i32* %7, align 4 | |
  %141 = mul nsw i32 %139, %140 | |
  %142 = load i32, i32* %23, align 4 | |
  %143 = add nsw i32 %141, %142 | |
  %144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %143) #10 | |
  %145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10 | |
  store float %145, float* %19, align 4 | |
  store i32 0, i32* %14, align 4 | |
  br label %146 | |
; <label>:146: ; preds = %137, %136 | |
  %147 = bitcast i32* %23 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %147) #9 | |
  %148 = load i32, i32* %14, align 4 | |
  switch i32 %148, label %153 [ | |
    i32 0, label %149 | |
  ] | |
; <label>:149: ; preds = %146 | |
  br label %150 | |
; <label>:150: ; preds = %149 | |
  %151 = load i32, i32* %22, align 4 | |
  %152 = add nsw i32 %151, 1 | |
  store i32 %152, i32* %22, align 4 | |
  br label %121, !llvm.loop !76 | |
; <label>:153: ; preds = %146, %124 | |
  %154 = bitcast i32* %22 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %154) #9 | |
  br label %155 | |
; <label>:155: ; preds = %153 | |
  store i32 5, i32* %14, align 4 | |
  br label %185 | |
; <label>:156: ; preds = %108 | |
  ; Unchecked path: whole 16-element chunk is in bounds; no per-element test.
  %157 = bitcast i32* %24 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %157) #9 | |
  store i32 0, i32* %24, align 4 | |
  br label %158 | |
; <label>:158: ; preds = %180, %156 | |
  %159 = load i32, i32* %24, align 4 | |
  %160 = icmp slt i32 %159, 16 | |
  br i1 %160, label %163, label %161 | |
; <label>:161: ; preds = %158 | |
  store i32 11, i32* %14, align 4 | |
  %162 = bitcast i32* %24 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %162) #9 | |
  br label %183 | |
; <label>:163: ; preds = %158 | |
  %164 = bitcast i32* %25 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %164) #9 | |
  %165 = load i32, i32* %17, align 4 | |
  %166 = load i32, i32* %20, align 4 | |
  %167 = load i32, i32* %24, align 4 | |
  %168 = add nsw i32 %166, %167 | |
  %169 = mul nsw i32 256, %168 | |
  %170 = add nsw i32 %165, %169 | |
  store i32 %170, i32* %25, align 4 | |
  %171 = load float, float* %19, align 4 | |
  %172 = load i32, i32* %18, align 4 | |
  %173 = load i32, i32* %7, align 4 | |
  %174 = mul nsw i32 %172, %173 | |
  %175 = load i32, i32* %25, align 4 | |
  %176 = add nsw i32 %174, %175 | |
  %177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %176) #10 | |
  %178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10 | |
  store float %178, float* %19, align 4 | |
  %179 = bitcast i32* %25 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %179) #9 | |
  br label %180 | |
; <label>:180: ; preds = %163 | |
  %181 = load i32, i32* %24, align 4 | |
  %182 = add nsw i32 %181, 1 | |
  store i32 %182, i32* %24, align 4 | |
  br label %158, !llvm.loop !77 | |
; <label>:183: ; preds = %161 | |
  br label %184 | |
; <label>:184: ; preds = %183 | |
  store i32 0, i32* %14, align 4 | |
  br label %185 | |
; <label>:185: ; preds = %184, %155 | |
  %186 = bitcast i32* %21 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %186) #9 | |
  %187 = load i32, i32* %14, align 4 | |
  switch i32 %187, label %192 [ | |
    i32 0, label %188 | |
  ] | |
; <label>:188: ; preds = %185 | |
  br label %189 | |
; <label>:189: ; preds = %188 | |
  %190 = load i32, i32* %20, align 4 | |
  %191 = add nsw i32 %190, 16 | |
  store i32 %191, i32* %20, align 4 | |
  br label %104, !llvm.loop !78 | |
; <label>:192: ; preds = %185, %107 | |
  %193 = bitcast i32* %20 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %193) #9 | |
  br label %194 | |
; <label>:194: ; preds = %192 | |
  br label %195 | |
; <label>:195: ; preds = %194, %80 | |
  ; Warp-level tree reduction: offsets %26 = 16,8,4,2,1; each step combines
  ; the lane's accumulator with the lane `offset` below via the legacy
  ; (pre-Volta, mask-less) __shfl_down(float, int, width=32) intrinsic.
  %196 = bitcast i32* %26 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %196) #9 | |
  store i32 16, i32* %26, align 4 | |
  br label %197 | |
; <label>:197: ; preds = %208, %195 | |
  %198 = load i32, i32* %26, align 4 | |
  %199 = icmp sgt i32 %198, 0 | |
  br i1 %199, label %202, label %200 | |
; <label>:200: ; preds = %197 | |
  store i32 14, i32* %14, align 4 | |
  %201 = bitcast i32* %26 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %201) #9 | |
  br label %211 | |
; <label>:202: ; preds = %197 | |
  %203 = load float, float* %19, align 4 | |
  %204 = load i32, i32* %26, align 4 | |
  %205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10 | |
  %206 = load float, float* %19, align 4 | |
  %207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10 | |
  store float %207, float* %19, align 4 | |
  br label %208 | |
; <label>:208: ; preds = %202 | |
  %209 = load i32, i32* %26, align 4 | |
  %210 = sdiv i32 %209, 2 | |
  store i32 %210, i32* %26, align 4 | |
  br label %197, !llvm.loop !79 | |
; <label>:211: ; preds = %200 | |
  ; Write-back: only warp lane 0 (%27 = threadIdx.x & 31 == 0) and only for
  ; an in-range row merges the warp's result into output[row] via the
  ; reducer's atomic_reduce.
  %212 = bitcast i32* %27 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %212) #9 | |
  %213 = load i32, i32* %12, align 4 | |
  %214 = and i32 %213, 31 | |
  store i32 %214, i32* %27, align 4 | |
  %215 = load i32, i32* %27, align 4 | |
  %216 = icmp eq i32 %215, 0 | |
  br i1 %216, label %217, label %225 | |
; <label>:217: ; preds = %211 | |
  %218 = load i32, i32* %18, align 4 | |
  %219 = load i32, i32* %6, align 4 | |
  %220 = icmp slt i32 %218, %219 | |
  br i1 %220, label %221, label %225 | |
; <label>:221: ; preds = %217 | |
  %222 = load i32, i32* %18, align 4 | |
  %223 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %222) #10 | |
  %224 = load float, float* %19, align 4 | |
  call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10 | |
  br label %225 | |
; <label>:225: ; preds = %221, %217, %211 | |
  ; Per-iteration cleanup (reverse lifetime ends), then advance the outer
  ; loop counter by gridDim.x (32).
  %226 = bitcast i32* %27 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %226) #9 | |
  %227 = bitcast float* %19 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %227) #9 | |
  %228 = bitcast i32* %18 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %228) #9 | |
  %229 = bitcast i32* %17 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %229) #9 | |
  %230 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %230) #9 | |
  %231 = bitcast i32* %15 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %231) #9 | |
  br label %232 | |
; <label>:232: ; preds = %225 | |
  %233 = load i32, i32* %13, align 4 | |
  %234 = add nsw i32 %233, 32 | |
  store i32 %234, i32* %13, align 4 | |
  br label %74 | |
; <label>:235: ; preds = %78 | |
  ; Function exit: release remaining locals and return.
  %236 = bitcast i32* %12 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %236) #9 | |
  %237 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %237) #9 | |
  %238 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %238) #9 | |
  %239 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %239) #9 | |
  %240 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %240) #9 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
;
; NOTE(review): machine-generated NVPTX LLVM IR for Eigen's
; ColumnReduceKernel<NUM_BLOCKS=128, NUM_THREADS=256, MAX_ROWS=16> with a
; float TensorEvaluator input, PtrWrapper<float,int> output, and the empty
; (size-1) CudaSumReducer, which is repeatedly bitcast to its
; FnSumReducer<Identity> base before calls. SSA numbering is positional --
; do not insert or delete instructions by hand.
;
; Arguments: %0 = reducer (byval), %1 = input evaluator (byval),
; %2/%3 = i32 extents stored into %6/%7. From their uses, %6 bounds the row
; index and %7 is the row stride (row * %7 + col), so presumably
; %6 = num_rows and %7 = num_cols -- TODO confirm upstream. %4 = output.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  %9 = alloca i32, align 4 | |
  %10 = alloca i32, align 4 | |
  %11 = alloca i32, align 4 | |
  ; %12 is the cleanup-destination slot routing loop exits (values 2/5).
  %12 = alloca i32 | |
  %13 = alloca i32, align 4 | |
  %14 = alloca i32, align 4 | |
  %15 = alloca float, align 4 | |
  %16 = alloca i32, align 4 | |
  %17 = alloca float, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
  ; --- launch-configuration assertions: blockDim must be (256,1,1) and
  ; gridDim must be (128,1,1), matching the template parameters. ---
  %18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %19 = icmp eq i32 %18, 256 | |
  br i1 %19, label %20, label %21 | |
; <label>:20: ; preds = %5 | |
  br label %22 | |
; <label>:21: ; preds = %5 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %22 | |
; <label>:22: ; preds = %21, %20 | |
  %23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
  %24 = icmp eq i32 %23, 1 | |
  br i1 %24, label %25, label %26 | |
; <label>:25: ; preds = %22 | |
  br label %27 | |
; <label>:26: ; preds = %22 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %27 | |
; <label>:27: ; preds = %26, %25 | |
  %28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
  %29 = icmp eq i32 %28, 1 | |
  br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %27 | |
  br label %32 | |
; <label>:31: ; preds = %27 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %32 | |
; <label>:32: ; preds = %31, %30 | |
  %33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %34 = icmp eq i32 %33, 128 | |
  br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
  br label %37 | |
; <label>:36: ; preds = %32 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %37 | |
; <label>:37: ; preds = %36, %35 | |
  %38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
  %39 = icmp eq i32 %38, 1 | |
  br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
  br label %42 | |
; <label>:41: ; preds = %37 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %42 | |
; <label>:42: ; preds = %41, %40 | |
  %43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
  %44 = icmp eq i32 %43, 1 | |
  br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
  br label %47 | |
; <label>:46: ; preds = %42 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %47 | |
; <label>:47: ; preds = %46, %45 | |
  ; --- setup: %8 = ceil(%6 / 16) * %7 = total work items (one per
  ; (16-row strip, column) pair); %9 = blockIdx.x; %10 = threadIdx.x;
  ; %11 = initial flat index blockIdx.x*256 + threadIdx.x. ---
  %48 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %48) #9 | |
  %49 = load i32, i32* %6, align 4 | |
  %50 = add nsw i32 %49, 16 | |
  %51 = sub nsw i32 %50, 1 | |
  %52 = sdiv i32 %51, 16 | |
  %53 = load i32, i32* %7, align 4 | |
  %54 = mul nsw i32 %52, %53 | |
  store i32 %54, i32* %8, align 4 | |
  %55 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
  %56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %56, i32* %9, align 4 | |
  %57 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %57) #9 | |
  %58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %58, i32* %10, align 4 | |
  %59 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
  %60 = load i32, i32* %9, align 4 | |
  %61 = mul nsw i32 %60, 256 | |
  %62 = load i32, i32* %10, align 4 | |
  %63 = add nsw i32 %61, %62 | |
  store i32 %63, i32* %11, align 4 | |
  br label %64 | |
; <label>:64: ; preds = %135, %47 | |
  ; Outer grid-stride loop over flat index %11, step 32768
  ; (= 128 blocks * 256 threads; increment at label %135).
  %65 = load i32, i32* %11, align 4 | |
  %66 = load i32, i32* %8, align 4 | |
  %67 = icmp slt i32 %65, %66 | |
  br i1 %67, label %70, label %68 | |
; <label>:68: ; preds = %64 | |
  store i32 2, i32* %12, align 4 | |
  %69 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %69) #9 | |
  br label %138 | |
; <label>:70: ; preds = %64 | |
  ; Decompose: %13 = %11 mod %7 (column);
  ; %14 = ((%11 div %7) mod ceil(%6/16)) * 16 (starting row of the strip);
  ; %15 = accumulator seeded with FnSumReducer bottom_value.
  %71 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
  %72 = load i32, i32* %11, align 4 | |
  %73 = load i32, i32* %7, align 4 | |
  %74 = srem i32 %72, %73 | |
  store i32 %74, i32* %13, align 4 | |
  %75 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %75) #9 | |
  %76 = load i32, i32* %11, align 4 | |
  %77 = load i32, i32* %7, align 4 | |
  %78 = sdiv i32 %76, %77 | |
  %79 = load i32, i32* %6, align 4 | |
  %80 = add nsw i32 %79, 16 | |
  %81 = sub nsw i32 %80, 1 | |
  %82 = sdiv i32 %81, 16 | |
  %83 = srem i32 %78, %82 | |
  %84 = mul nsw i32 %83, 16 | |
  store i32 %84, i32* %14, align 4 | |
  %85 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
  %86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10 | |
  store float %87, float* %15, align 4 | |
  %88 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %88) #9 | |
  store i32 0, i32* %16, align 4 | |
  br label %89 | |
; <label>:89: ; preds = %124, %70 | |
  ; Inner loop over the 16 rows of the strip (%16 = 0..15).
  %90 = load i32, i32* %16, align 4 | |
  %91 = icmp slt i32 %90, 16 | |
  br i1 %91, label %94, label %92 | |
; <label>:92: ; preds = %89 | |
  store i32 5, i32* %12, align 4 | |
  %93 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %93) #9 | |
  br label %127 | |
; <label>:94: ; preds = %89 | |
  ; %17 = coeff((row+j) * %7 + col) when both col < %7 and row+j < %6;
  ; otherwise the reducer's bottom value (padding for the ragged tail).
  %95 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
  %96 = load i32, i32* %13, align 4 | |
  %97 = load i32, i32* %7, align 4 | |
  %98 = icmp slt i32 %96, %97 | |
  br i1 %98, label %99, label %114 | |
; <label>:99: ; preds = %94 | |
  %100 = load i32, i32* %14, align 4 | |
  %101 = load i32, i32* %16, align 4 | |
  %102 = add nsw i32 %100, %101 | |
  %103 = load i32, i32* %6, align 4 | |
  %104 = icmp slt i32 %102, %103 | |
  br i1 %104, label %105, label %114 | |
; <label>:105: ; preds = %99 | |
  %106 = load i32, i32* %14, align 4 | |
  %107 = load i32, i32* %16, align 4 | |
  %108 = add nsw i32 %106, %107 | |
  %109 = load i32, i32* %7, align 4 | |
  %110 = mul nsw i32 %108, %109 | |
  %111 = load i32, i32* %13, align 4 | |
  %112 = add nsw i32 %110, %111 | |
  %113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %112) #10 | |
  br label %117 | |
; <label>:114: ; preds = %99, %94 | |
  %115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10 | |
  br label %117 | |
; <label>:117: ; preds = %114, %105 | |
  ; accum = reducer(accum, %17).
  %118 = phi float [ %113, %105 ], [ %116, %114 ] | |
  store float %118, float* %17, align 4 | |
  %119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %120 = load float, float* %15, align 4 | |
  %121 = load float, float* %17, align 4 | |
  %122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10 | |
  store float %122, float* %15, align 4 | |
  %123 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %123) #9 | |
  br label %124 | |
; <label>:124: ; preds = %117 | |
  %125 = load i32, i32* %16, align 4 | |
  %126 = add nsw i32 %125, 1 | |
  store i32 %126, i32* %16, align 4 | |
  br label %89 | |
; <label>:127: ; preds = %92 | |
  ; Write-back: every thread merges its 16-row partial into output[col]
  ; via atomic_reduce (no warp reduction in the column kernel), then
  ; releases per-iteration locals.
  %128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
  %129 = load i32, i32* %13, align 4 | |
  %130 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %129) #10 | |
  %131 = load float, float* %15, align 4 | |
  call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10 | |
  %132 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %132) #9 | |
  %133 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %133) #9 | |
  %134 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %134) #9 | |
  br label %135 | |
; <label>:135: ; preds = %127 | |
  %136 = load i32, i32* %11, align 4 | |
  %137 = add nsw i32 %136, 32768 | |
  store i32 %137, i32* %11, align 4 | |
  br label %64 | |
; <label>:138: ; preds = %68 | |
  ; Function exit: release remaining locals and return.
  %139 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %139) #9 | |
  %140 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %140) #9 | |
  %141 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %141) #9 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated LLVM IR (clang CUDA device compilation of
; Eigen's cxx11_tensor_reduction_cuda test). Code below is byte-identical to
; the original dump (including the trailing "| |" extraction artifacts);
; only these ';' review comments were added.
;
; Eigen::internal::<anon>::ColumnReduceKernel<128, 256, 16,
;   TensorEvaluator<const TensorMap<Tensor<float,2,0,int>>, GpuDevice>,
;   PtrWrapper<float,int>, CudaMaxReducer>
;
; Arguments: %0 = CudaMaxReducer (byval), %1 = input evaluator (byval),
; %2 and %3 = two i32 extents, %4 = output PtrWrapper (byval).
; %2 is divided into chunks of 16 and %3 indexes the output via coeffRef,
; so presumably %2 = num_rows (reduced dim) and %3 = num_cols — TODO confirm
; against Eigen's TensorReductionCuda.h.
;
; Structure: (1) assert the exact launch configuration blockDim == {256,1,1},
; gridDim == {128,1,1}; (2) grid-stride loop over ceil(%2/16) * %3 work items
; with stride 32768 (= 128 blocks * 256 threads/block); (3) per work item,
; fold up to 16 consecutive rows of one column into a running max seeded with
; CudaMaxReducer::bottom_value(); (4) combine the partial result into the
; output element with CudaMaxReducer::atomic_reduce.
; ---------------------------------------------------------------------------
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat { | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  %9 = alloca i32, align 4 | |
  %10 = alloca i32, align 4 | |
  %11 = alloca i32, align 4 | |
  %12 = alloca i32 | |
  %13 = alloca i32, align 4 | |
  %14 = alloca i32, align 4 | |
  %15 = alloca float, align 4 | |
  %16 = alloca i32, align 4 | |
  %17 = alloca float, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
; --- launch-configuration assertions: each builtin is compared against the
; --- template launch constants and __assert_fail is called on mismatch.
  %18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %19 = icmp eq i32 %18, 256 | |
  br i1 %19, label %20, label %21 | |
; <label>:20: ; preds = %5 | |
  br label %22 | |
; <label>:21: ; preds = %5 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %22 | |
; <label>:22: ; preds = %21, %20 | |
  %23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
  %24 = icmp eq i32 %23, 1 | |
  br i1 %24, label %25, label %26 | |
; <label>:25: ; preds = %22 | |
  br label %27 | |
; <label>:26: ; preds = %22 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %27 | |
; <label>:27: ; preds = %26, %25 | |
  %28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
  %29 = icmp eq i32 %28, 1 | |
  br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %27 | |
  br label %32 | |
; <label>:31: ; preds = %27 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %32 | |
; <label>:32: ; preds = %31, %30 | |
  %33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %34 = icmp eq i32 %33, 128 | |
  br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
  br label %37 | |
; <label>:36: ; preds = %32 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %37 | |
; <label>:37: ; preds = %36, %35 | |
  %38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
  %39 = icmp eq i32 %38, 1 | |
  br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
  br label %42 | |
; <label>:41: ; preds = %37 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %42 | |
; <label>:42: ; preds = %41, %40 | |
  %43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
  %44 = icmp eq i32 %43, 1 | |
  br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
  br label %47 | |
; <label>:46: ; preds = %42 | |
  call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
  br label %47 | |
; <label>:47: ; preds = %46, %45 | |
; --- work-item count: %8 = ceil(%2 / 16) * %3 (row chunks of 16 per column),
; --- %11 = global thread id = blockIdx.x * 256 + threadIdx.x.
  %48 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %48) #9 | |
  %49 = load i32, i32* %6, align 4 | |
  %50 = add nsw i32 %49, 16 | |
  %51 = sub nsw i32 %50, 1 | |
  %52 = sdiv i32 %51, 16 | |
  %53 = load i32, i32* %7, align 4 | |
  %54 = mul nsw i32 %52, %53 | |
  store i32 %54, i32* %8, align 4 | |
  %55 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %55) #9 | |
  %56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %56, i32* %9, align 4 | |
  %57 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %57) #9 | |
  %58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  store i32 %58, i32* %10, align 4 | |
  %59 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
  %60 = load i32, i32* %9, align 4 | |
  %61 = mul nsw i32 %60, 256 | |
  %62 = load i32, i32* %10, align 4 | |
  %63 = add nsw i32 %61, %62 | |
  store i32 %63, i32* %11, align 4 | |
  br label %64 | |
; --- outer grid-stride loop over work items: %11 += 32768 per iteration
; --- (= 128 blocks * 256 threads), exits when %11 >= %8.
; <label>:64: ; preds = %131, %47 | |
  %65 = load i32, i32* %11, align 4 | |
  %66 = load i32, i32* %8, align 4 | |
  %67 = icmp slt i32 %65, %66 | |
  br i1 %67, label %70, label %68 | |
; <label>:68: ; preds = %64 | |
  store i32 2, i32* %12, align 4 | |
  %69 = bitcast i32* %11 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %69) #9 | |
  br label %134 | |
; --- per work item: %13 = column (%11 mod %7), %14 = first row of the
; --- 16-row chunk, %15 = running max seeded with the reducer bottom value.
; <label>:70: ; preds = %64 | |
  %71 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %71) #9 | |
  %72 = load i32, i32* %11, align 4 | |
  %73 = load i32, i32* %7, align 4 | |
  %74 = srem i32 %72, %73 | |
  store i32 %74, i32* %13, align 4 | |
  %75 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %75) #9 | |
  %76 = load i32, i32* %11, align 4 | |
  %77 = load i32, i32* %7, align 4 | |
  %78 = sdiv i32 %76, %77 | |
  %79 = load i32, i32* %6, align 4 | |
  %80 = add nsw i32 %79, 16 | |
  %81 = sub nsw i32 %80, 1 | |
  %82 = sdiv i32 %81, 16 | |
  %83 = srem i32 %78, %82 | |
  %84 = mul nsw i32 %83, 16 | |
  store i32 %84, i32* %14, align 4 | |
  %85 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
  %86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
  store float %86, float* %15, align 4 | |
  %87 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %87) #9 | |
  store i32 0, i32* %16, align 4 | |
  br label %88 | |
; --- inner loop (%16 = 0..15): read coeff(row*%7 + col) when in bounds,
; --- otherwise the neutral bottom_value, and fold it into %15 via the
; --- reducer's operator() (max).
; <label>:88: ; preds = %121, %70 | |
  %89 = load i32, i32* %16, align 4 | |
  %90 = icmp slt i32 %89, 16 | |
  br i1 %90, label %93, label %91 | |
; <label>:91: ; preds = %88 | |
  store i32 5, i32* %12, align 4 | |
  %92 = bitcast i32* %16 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %92) #9 | |
  br label %124 | |
; <label>:93: ; preds = %88 | |
  %94 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %94) #9 | |
  %95 = load i32, i32* %13, align 4 | |
  %96 = load i32, i32* %7, align 4 | |
  %97 = icmp slt i32 %95, %96 | |
  br i1 %97, label %98, label %113 | |
; <label>:98: ; preds = %93 | |
  %99 = load i32, i32* %14, align 4 | |
  %100 = load i32, i32* %16, align 4 | |
  %101 = add nsw i32 %99, %100 | |
  %102 = load i32, i32* %6, align 4 | |
  %103 = icmp slt i32 %101, %102 | |
  br i1 %103, label %104, label %113 | |
; <label>:104: ; preds = %98 | |
  %105 = load i32, i32* %14, align 4 | |
  %106 = load i32, i32* %16, align 4 | |
  %107 = add nsw i32 %105, %106 | |
  %108 = load i32, i32* %7, align 4 | |
  %109 = mul nsw i32 %107, %108 | |
  %110 = load i32, i32* %13, align 4 | |
  %111 = add nsw i32 %109, %110 | |
  %112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %111) #10 | |
  br label %115 | |
; <label>:113: ; preds = %98, %93 | |
  %114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10 | |
  br label %115 | |
; <label>:115: ; preds = %113, %104 | |
  %116 = phi float [ %112, %104 ], [ %114, %113 ] | |
  store float %116, float* %17, align 4 | |
  %117 = load float, float* %15, align 4 | |
  %118 = load float, float* %17, align 4 | |
  %119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10 | |
  store float %119, float* %15, align 4 | |
  %120 = bitcast float* %17 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %120) #9 | |
  br label %121 | |
; <label>:121: ; preds = %115 | |
  %122 = load i32, i32* %16, align 4 | |
  %123 = add nsw i32 %122, 1 | |
  store i32 %123, i32* %16, align 4 | |
  br label %88 | |
; --- write-back: combine partial max into output[col] with atomic_reduce.
; <label>:124: ; preds = %91 | |
  %125 = load i32, i32* %13, align 4 | |
  %126 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %125) #10 | |
  %127 = load float, float* %15, align 4 | |
  call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10 | |
  %128 = bitcast float* %15 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %128) #9 | |
  %129 = bitcast i32* %14 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %129) #9 | |
  %130 = bitcast i32* %13 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %130) #9 | |
  br label %131 | |
; <label>:131: ; preds = %124 | |
  %132 = load i32, i32* %11, align 4 | |
  %133 = add nsw i32 %132, 32768 | |
  store i32 %133, i32* %11, align 4 | |
  br label %64 | |
; <label>:134: ; preds = %68 | |
  %135 = bitcast i32* %10 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %135) #9 | |
  %136 = bitcast i32* %9 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %136) #9 | |
  %137 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %137) #9 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; Eigen::internal::EigenMetaKernel<TensorEvaluator<const TensorAssignOp<
;   TensorMap<Tensor<float,1,0,int>>, const TensorForcedEvalOp<sum-reduction>>,
;   GpuDevice>, int>(evaluator byval, int size)
; Computes first_index = blockIdx.x*blockDim.x + threadIdx.x (%4) and
; step_size = blockDim.x*gridDim.x (%5), copies the 168-byte byval evaluator
; into two locals (%6, then %8 — the unoptimized -O0 pattern for passing a
; byval argument on), and forwards to EigenMetaKernelEval<...,false>::run.
; %7 is a one-byte local stored 0 before the call — presumably a bool
; temporary from the original C++; its meaning is not visible here.
; ---------------------------------------------------------------------------
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.15"* byval align 8, i32) #0 comdat { | |
  %3 = alloca i32, align 4 | |
  %4 = alloca i32, align 4 | |
  %5 = alloca i32, align 4 | |
  %6 = alloca %"struct.Eigen::TensorEvaluator.15", align 8 | |
  %7 = alloca i8, align 1 | |
  %8 = alloca %"struct.Eigen::TensorEvaluator.15", align 8 | |
  store i32 %1, i32* %3, align 4 | |
  %9 = bitcast i32* %4 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
  %10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  %11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %12 = mul i32 %10, %11 | |
  %13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  %14 = add i32 %12, %13 | |
  store i32 %14, i32* %4, align 4 | |
  %15 = bitcast i32* %5 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %15) #9 | |
  %16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %18 = mul i32 %16, %17 | |
  store i32 %18, i32* %5, align 4 | |
  %19 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8* | |
  call void @llvm.lifetime.start(i64 168, i8* %19) #9 | |
  %20 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8* | |
  %21 = bitcast %"struct.Eigen::TensorEvaluator.15"* %0 to i8* | |
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 168, i32 8, i1 false) | |
  call void @llvm.lifetime.start(i64 1, i8* %7) #9 | |
  store i8 0, i8* %7, align 1 | |
  %22 = bitcast %"struct.Eigen::TensorEvaluator.15"* %8 to i8* | |
  %23 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8* | |
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 168, i32 8, i1 false) | |
  %24 = load i32, i32* %4, align 4 | |
  %25 = load i32, i32* %3, align 4 | |
  %26 = load i32, i32* %5, align 4 | |
  call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.15"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10 | |
  call void @llvm.lifetime.end(i64 1, i8* %7) #9 | |
  %27 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8* | |
  call void @llvm.lifetime.end(i64 168, i8* %27) #9 | |
  %28 = bitcast i32* %5 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %28) #9 | |
  %29 = bitcast i32* %4 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %29) #9 | |
  ret void | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; EigenMetaKernelEval<...,Vectorizable=false>::run(evaluator, firstIdx, size,
; step). Grid-stride loop: for (i = %1; i < %2; i += %3) evalScalar(i).
; The callee receives the byval evaluator pointer %0 directly.
; ---------------------------------------------------------------------------
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.15"* byval align 8, i32, i32, i32) #2 comdat align 2 { | |
  %5 = alloca i32, align 4 | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  store i32 %1, i32* %5, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
  %9 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
  %10 = load i32, i32* %5, align 4 | |
  store i32 %10, i32* %8, align 4 | |
  br label %11 | |
; loop header: continue while i (%8) < size (%6)
; <label>:11: ; preds = %19, %4 | |
  %12 = load i32, i32* %8, align 4 | |
  %13 = load i32, i32* %6, align 4 | |
  %14 = icmp slt i32 %12, %13 | |
  br i1 %14, label %17, label %15 | |
; <label>:15: ; preds = %11 | |
  %16 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
  br label %23 | |
; <label>:17: ; preds = %11 | |
  %18 = load i32, i32* %8, align 4 | |
  call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.15"* %0, i32 %18) #10 | |
  br label %19 | |
; loop latch: i += step (%7)
; <label>:19: ; preds = %17 | |
  %20 = load i32, i32* %7, align 4 | |
  %21 = load i32, i32* %8, align 4 | |
  %22 = add nsw i32 %21, %20 | |
  store i32 %22, i32* %8, align 4 | |
  br label %11 | |
; <label>:23: ; preds = %15 | |
  ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; TensorEvaluator<assign-from-forced-eval>::evalScalar(index):
; reads coeff(index) from the right-hand-side sub-evaluator (struct member 1)
; and stores it through coeffRef(index) of the left-hand-side output
; evaluator (struct member 0).
; ---------------------------------------------------------------------------
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.15"*, i32) #4 comdat align 2 { | |
  %3 = alloca %"struct.Eigen::TensorEvaluator.15"*, align 8 | |
  %4 = alloca i32, align 4 | |
  store %"struct.Eigen::TensorEvaluator.15"* %0, %"struct.Eigen::TensorEvaluator.15"** %3, align 8 | |
  store i32 %1, i32* %4, align 4 | |
  %5 = load %"struct.Eigen::TensorEvaluator.15"*, %"struct.Eigen::TensorEvaluator.15"** %3, align 8 | |
  %6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %5, i32 0, i32 1 | |
  %7 = load i32, i32* %4, align 4 | |
  %8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.17"* %6, i32 %7) #10 | |
  %9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %5, i32 0, i32 0 | |
  %10 = load i32, i32* %4, align 4 | |
  %11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %9, i32 %10) #10 | |
  store float %8, float* %11, align 4 | |
  ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; TensorEvaluator<const TensorForcedEvalOp<...>>::coeff(index):
; loads the float* buffer from struct member 3 and returns buffer[index]
; (index sign-extended i32 -> i64 for the GEP).
; ---------------------------------------------------------------------------
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.17"*, i32) #4 comdat align 2 { | |
  %3 = alloca %"struct.Eigen::TensorEvaluator.17"*, align 8 | |
  %4 = alloca i32, align 4 | |
  store %"struct.Eigen::TensorEvaluator.17"* %0, %"struct.Eigen::TensorEvaluator.17"** %3, align 8 | |
  store i32 %1, i32* %4, align 4 | |
  %5 = load %"struct.Eigen::TensorEvaluator.17"*, %"struct.Eigen::TensorEvaluator.17"** %3, align 8 | |
  %6 = load i32, i32* %4, align 4 | |
  %7 = sext i32 %6 to i64 | |
  %8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.17", %"struct.Eigen::TensorEvaluator.17"* %5, i32 0, i32 3 | |
  %9 = load float*, float** %8, align 8 | |
  %10 = getelementptr inbounds float, float* %9, i64 %7 | |
  %11 = load float, float* %10, align 4 | |
  ret float %11 | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; TensorEvaluator<TensorMap<Tensor<float,1,0,int>>>::coeffRef(index):
; loads the float* data pointer from struct member 0 and returns
; &data[index] (index sign-extended i32 -> i64 for the GEP).
; ---------------------------------------------------------------------------
define linkonce_odr dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"*, i32) #4 comdat align 2 { | |
  %3 = alloca %"struct.Eigen::TensorEvaluator.16"*, align 8 | |
  %4 = alloca i32, align 4 | |
  store %"struct.Eigen::TensorEvaluator.16"* %0, %"struct.Eigen::TensorEvaluator.16"** %3, align 8 | |
  store i32 %1, i32* %4, align 4 | |
  %5 = load %"struct.Eigen::TensorEvaluator.16"*, %"struct.Eigen::TensorEvaluator.16"** %3, align 8 | |
  %6 = load i32, i32* %4, align 4 | |
  %7 = sext i32 %6 to i64 | |
  %8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %5, i32 0, i32 0 | |
  %9 = load float*, float** %8, align 8 | |
  %10 = getelementptr inbounds float, float* %9, i64 %7 | |
  ret float* %10 | |
} | |
; Function Attrs: convergent nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; Same shape as the EigenMetaKernel above but instantiated for the
; direct (non-forced-eval) sum-reduction assignment; the evaluator struct
; (TensorEvaluator.24) is 136 bytes here instead of 168.
; Computes first_index (%4) and step_size (%5) from the CUDA builtins,
; copies the byval evaluator twice, and forwards to
; EigenMetaKernelEval<...,false>::run. %7 is a one-byte local stored 0
; before the call — presumably a bool temporary; meaning not visible here.
; ---------------------------------------------------------------------------
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.24"* byval align 8, i32) #0 comdat { | |
  %3 = alloca i32, align 4 | |
  %4 = alloca i32, align 4 | |
  %5 = alloca i32, align 4 | |
  %6 = alloca %"struct.Eigen::TensorEvaluator.24", align 8 | |
  %7 = alloca i8, align 1 | |
  %8 = alloca %"struct.Eigen::TensorEvaluator.24", align 8 | |
  store i32 %1, i32* %3, align 4 | |
  %9 = bitcast i32* %4 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
  %10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  %11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %12 = mul i32 %10, %11 | |
  %13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  %14 = add i32 %12, %13 | |
  store i32 %14, i32* %4, align 4 | |
  %15 = bitcast i32* %5 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %15) #9 | |
  %16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %18 = mul i32 %16, %17 | |
  store i32 %18, i32* %5, align 4 | |
  %19 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8* | |
  call void @llvm.lifetime.start(i64 136, i8* %19) #9 | |
  %20 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8* | |
  %21 = bitcast %"struct.Eigen::TensorEvaluator.24"* %0 to i8* | |
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 136, i32 8, i1 false) | |
  call void @llvm.lifetime.start(i64 1, i8* %7) #9 | |
  store i8 0, i8* %7, align 1 | |
  %22 = bitcast %"struct.Eigen::TensorEvaluator.24"* %8 to i8* | |
  %23 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8* | |
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 136, i32 8, i1 false) | |
  %24 = load i32, i32* %4, align 4 | |
  %25 = load i32, i32* %3, align 4 | |
  %26 = load i32, i32* %5, align 4 | |
  call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.24"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10 | |
  call void @llvm.lifetime.end(i64 1, i8* %7) #9 | |
  %27 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8* | |
  call void @llvm.lifetime.end(i64 136, i8* %27) #9 | |
  %28 = bitcast i32* %5 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %28) #9 | |
  %29 = bitcast i32* %4 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %29) #9 | |
  ret void | |
} | |
; Function Attrs: alwaysinline convergent inlinehint nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; EigenMetaKernelEval<...,Vectorizable=false>::run for the TensorEvaluator.24
; instantiation. Identical control flow to the .15 variant: grid-stride
; loop for (i = %1; i < %2; i += %3) calling evalScalar(i) on %0.
; ---------------------------------------------------------------------------
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.24"* byval align 8, i32, i32, i32) #2 comdat align 2 { | |
  %5 = alloca i32, align 4 | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  %8 = alloca i32, align 4 | |
  store i32 %1, i32* %5, align 4 | |
  store i32 %2, i32* %6, align 4 | |
  store i32 %3, i32* %7, align 4 | |
  %9 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %9) #9 | |
  %10 = load i32, i32* %5, align 4 | |
  store i32 %10, i32* %8, align 4 | |
  br label %11 | |
; loop header: continue while i (%8) < size (%6)
; <label>:11: ; preds = %19, %4 | |
  %12 = load i32, i32* %8, align 4 | |
  %13 = load i32, i32* %6, align 4 | |
  %14 = icmp slt i32 %12, %13 | |
  br i1 %14, label %17, label %15 | |
; <label>:15: ; preds = %11 | |
  %16 = bitcast i32* %8 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %16) #9 | |
  br label %23 | |
; <label>:17: ; preds = %11 | |
  %18 = load i32, i32* %8, align 4 | |
  call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.24"* %0, i32 %18) #10 | |
  br label %19 | |
; loop latch: i += step (%7)
; <label>:19: ; preds = %17 | |
  %20 = load i32, i32* %7, align 4 | |
  %21 = load i32, i32* %8, align 4 | |
  %22 = add nsw i32 %21, %20 | |
  store i32 %22, i32* %8, align 4 | |
  br label %11 | |
; <label>:23: ; preds = %15 | |
  ret void | |
} | |
; Function Attrs: convergent inlinehint nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; TensorEvaluator<assign-from-reduction>::evalScalar(index) for the
; TensorEvaluator.24 instantiation: evaluates the reduction sub-evaluator's
; coeff(index) (struct member 1, TensorEvaluator.12) and stores the result
; through coeffRef(index) of the output map evaluator (struct member 0).
; ---------------------------------------------------------------------------
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.24"*, i32) #4 comdat align 2 { | |
  %3 = alloca %"struct.Eigen::TensorEvaluator.24"*, align 8 | |
  %4 = alloca i32, align 4 | |
  store %"struct.Eigen::TensorEvaluator.24"* %0, %"struct.Eigen::TensorEvaluator.24"** %3, align 8 | |
  store i32 %1, i32* %4, align 4 | |
  %5 = load %"struct.Eigen::TensorEvaluator.24"*, %"struct.Eigen::TensorEvaluator.24"** %3, align 8 | |
  %6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %5, i32 0, i32 1 | |
  %7 = load i32, i32* %4, align 4 | |
  %8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.12"* %6, i32 %7) #10 | |
  %9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %5, i32 0, i32 0 | |
  %10 = load i32, i32* %4, align 4 | |
  %11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %9, i32 %10) #10 | |
  store float %8, float* %11, align 4 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
; ---------------------------------------------------------------------------
; NOTE(review): machine-generated IR, unmodified; comments added.
; Eigen::internal::<anon>::InitVector<TensorEvaluator<TensorMap<
;   Tensor<float,1,0,int>>, GpuDevice>>(float value, int num_preds, output)
; Fill kernel: grid-stride loop writing %0 into output.coeffRef(i) for
; i = blockIdx.x*blockDim.x + threadIdx.x; i < %1; i += gridDim.x*blockDim.x.
; ---------------------------------------------------------------------------
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat { | |
  %4 = alloca float, align 4 | |
  %5 = alloca i32, align 4 | |
  %6 = alloca i32, align 4 | |
  %7 = alloca i32, align 4 | |
  store float %0, float* %4, align 4 | |
  store i32 %1, i32* %5, align 4 | |
  %8 = bitcast i32* %6 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %8) #9 | |
  %9 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
  %10 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %11 = mul i32 %9, %10 | |
  %12 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
  %13 = add i32 %11, %12 | |
  store i32 %13, i32* %6, align 4 | |
  %14 = bitcast i32* %7 to i8* | |
  call void @llvm.lifetime.start(i64 4, i8* %14) #9 | |
  %15 = load i32, i32* %6, align 4 | |
  store i32 %15, i32* %7, align 4 | |
  br label %16 | |
; loop header: continue while i (%7) < num_preds (%5)
; <label>:16: ; preds = %26, %3 | |
  %17 = load i32, i32* %7, align 4 | |
  %18 = load i32, i32* %5, align 4 | |
  %19 = icmp slt i32 %17, %18 | |
  br i1 %19, label %22, label %20 | |
; <label>:20: ; preds = %16 | |
  %21 = bitcast i32* %7 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %21) #9 | |
  br label %32 | |
; loop body: output.coeffRef(i) = value
; <label>:22: ; preds = %16 | |
  %23 = load float, float* %4, align 4 | |
  %24 = load i32, i32* %7, align 4 | |
  %25 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %2, i32 %24) #10 | |
  store float %23, float* %25, align 4 | |
  br label %26 | |
; loop latch: i += gridDim.x * blockDim.x
; <label>:26: ; preds = %22 | |
  %27 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
  %28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
  %29 = mul i32 %27, %28 | |
  %30 = load i32, i32* %7, align 4 | |
  %31 = add i32 %30, %29 | |
  store i32 %31, i32* %7, align 4 | |
  br label %16 | |
; <label>:32: ; preds = %20 | |
  %33 = bitcast i32* %6 to i8* | |
  call void @llvm.lifetime.end(i64 4, i8* %33) #9 | |
  ret void | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat { | |
%6 = alloca i32, align 4 | |
%7 = alloca i32, align 4 | |
%8 = alloca i32, align 4 | |
%9 = alloca i32, align 4 | |
%10 = alloca i32, align 4 | |
%11 = alloca i32, align 4 | |
%12 = alloca i32, align 4 | |
%13 = alloca i32, align 4 | |
%14 = alloca i32 | |
%15 = alloca i32, align 4 | |
%16 = alloca i32, align 4 | |
%17 = alloca i32, align 4 | |
%18 = alloca i32, align 4 | |
%19 = alloca float, align 4 | |
%20 = alloca i32, align 4 | |
%21 = alloca i32, align 4 | |
%22 = alloca i32, align 4 | |
%23 = alloca i32, align 4 | |
%24 = alloca i32, align 4 | |
%25 = alloca i32, align 4 | |
%26 = alloca i32, align 4 | |
%27 = alloca i32, align 4 | |
store i32 %2, i32* %6, align 4 | |
store i32 %3, i32* %7, align 4 | |
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10 | |
%29 = icmp eq i32 %28, 256 | |
br i1 %29, label %30, label %31 | |
; <label>:30: ; preds = %5 | |
br label %32 | |
; <label>:31: ; preds = %5 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %32 | |
; <label>:32: ; preds = %31, %30 | |
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10 | |
%34 = icmp eq i32 %33, 1 | |
br i1 %34, label %35, label %36 | |
; <label>:35: ; preds = %32 | |
br label %37 | |
; <label>:36: ; preds = %32 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %37 | |
; <label>:37: ; preds = %36, %35 | |
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10 | |
%39 = icmp eq i32 %38, 1 | |
br i1 %39, label %40, label %41 | |
; <label>:40: ; preds = %37 | |
br label %42 | |
; <label>:41: ; preds = %37 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %42 | |
; <label>:42: ; preds = %41, %40 | |
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10 | |
%44 = icmp eq i32 %43, 32 | |
br i1 %44, label %45, label %46 | |
; <label>:45: ; preds = %42 | |
br label %47 | |
; <label>:46: ; preds = %42 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %47 | |
; <label>:47: ; preds = %46, %45 | |
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10 | |
%49 = icmp eq i32 %48, 1 | |
br i1 %49, label %50, label %51 | |
; <label>:50: ; preds = %47 | |
br label %52 | |
; <label>:51: ; preds = %47 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %52 | |
; <label>:52: ; preds = %51, %50 | |
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10 | |
%54 = icmp eq i32 %53, 1 | |
br i1 %54, label %55, label %56 | |
; <label>:55: ; preds = %52 | |
br label %57 | |
; <label>:56: ; preds = %52 | |
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10 | |
br label %57 | |
; <label>:57: ; preds = %56, %55 | |
%58 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %58) #9 | |
store i32 16, i32* %8, align 4 | |
%59 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %59) #9 | |
%60 = load i32, i32* %7, align 4 | |
%61 = add nsw i32 %60, 32768 | |
%62 = sub nsw i32 %61, 1 | |
%63 = sdiv i32 %62, 32768 | |
store i32 %63, i32* %9, align 4 | |
%64 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %64) #9 | |
%65 = load i32, i32* %9, align 4 | |
%66 = load i32, i32* %6, align 4 | |
%67 = mul nsw i32 %65, %66 | |
store i32 %67, i32* %10, align 4 | |
%68 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %68) #9 | |
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %69, i32* %11, align 4 | |
%70 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %70) #9 | |
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10 | |
store i32 %71, i32* %12, align 4 | |
%72 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %72) #9 | |
%73 = load i32, i32* %11, align 4 | |
store i32 %73, i32* %13, align 4 | |
br label %74 | |
; <label>:74: ; preds = %237, %57 | |
%75 = load i32, i32* %13, align 4 | |
%76 = load i32, i32* %10, align 4 | |
%77 = icmp slt i32 %75, %76 | |
br i1 %77, label %80, label %78 | |
; <label>:78: ; preds = %74 | |
store i32 2, i32* %14, align 4 | |
%79 = bitcast i32* %13 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %79) #9 | |
br label %240 | |
; <label>:80: ; preds = %74 | |
%81 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %81) #9 | |
%82 = load i32, i32* %13, align 4 | |
%83 = load i32, i32* %9, align 4 | |
%84 = srem i32 %82, %83 | |
store i32 %84, i32* %15, align 4 | |
%85 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %85) #9 | |
%86 = load i32, i32* %13, align 4 | |
%87 = load i32, i32* %9, align 4 | |
%88 = sdiv i32 %86, %87 | |
store i32 %88, i32* %16, align 4 | |
%89 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %89) #9 | |
%90 = load i32, i32* %15, align 4 | |
%91 = mul nsw i32 %90, 256 | |
%92 = mul nsw i32 %91, 128 | |
%93 = load i32, i32* %12, align 4 | |
%94 = add nsw i32 %92, %93 | |
store i32 %94, i32* %17, align 4 | |
%95 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %95) #9 | |
%96 = load i32, i32* %16, align 4 | |
store i32 %96, i32* %18, align 4 | |
%97 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %97) #9 | |
%98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10 | |
store float %99, float* %19, align 4 | |
%100 = load i32, i32* %18, align 4 | |
%101 = load i32, i32* %6, align 4 | |
%102 = icmp slt i32 %100, %101 | |
br i1 %102, label %103, label %198 | |
; <label>:103: ; preds = %80 | |
%104 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %104) #9 | |
store i32 0, i32* %20, align 4 | |
br label %105 | |
; <label>:105: ; preds = %192, %103 | |
%106 = load i32, i32* %20, align 4 | |
%107 = icmp slt i32 %106, 128 | |
br i1 %107, label %109, label %108 | |
; <label>:108: ; preds = %105 | |
store i32 5, i32* %14, align 4 | |
br label %195 | |
; <label>:109: ; preds = %105 | |
%110 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %110) #9 | |
%111 = load i32, i32* %17, align 4 | |
%112 = load i32, i32* %20, align 4 | |
%113 = add nsw i32 %112, 16 | |
%114 = sub nsw i32 %113, 1 | |
%115 = mul nsw i32 256, %114 | |
%116 = add nsw i32 %111, %115 | |
store i32 %116, i32* %21, align 4 | |
%117 = load i32, i32* %21, align 4 | |
%118 = load i32, i32* %7, align 4 | |
%119 = icmp sge i32 %117, %118 | |
br i1 %119, label %120, label %158 | |
; <label>:120: ; preds = %109 | |
%121 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %121) #9 | |
store i32 0, i32* %22, align 4 | |
br label %122 | |
; <label>:122: ; preds = %152, %120 | |
%123 = load i32, i32* %22, align 4 | |
%124 = icmp slt i32 %123, 15 | |
br i1 %124, label %126, label %125 | |
; <label>:125: ; preds = %122 | |
store i32 8, i32* %14, align 4 | |
br label %155 | |
; <label>:126: ; preds = %122 | |
%127 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %127) #9 | |
%128 = load i32, i32* %17, align 4 | |
%129 = load i32, i32* %20, align 4 | |
%130 = load i32, i32* %22, align 4 | |
%131 = add nsw i32 %129, %130 | |
%132 = mul nsw i32 256, %131 | |
%133 = add nsw i32 %128, %132 | |
store i32 %133, i32* %23, align 4 | |
%134 = load i32, i32* %23, align 4 | |
%135 = load i32, i32* %7, align 4 | |
%136 = icmp sge i32 %134, %135 | |
br i1 %136, label %137, label %138 | |
; <label>:137: ; preds = %126 | |
store i32 8, i32* %14, align 4 | |
br label %148 | |
; <label>:138: ; preds = %126 | |
%139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%140 = load float, float* %19, align 4 | |
%141 = load i32, i32* %18, align 4 | |
%142 = load i32, i32* %7, align 4 | |
%143 = mul nsw i32 %141, %142 | |
%144 = load i32, i32* %23, align 4 | |
%145 = add nsw i32 %143, %144 | |
%146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %145) #10 | |
%147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10 | |
store float %147, float* %19, align 4 | |
store i32 0, i32* %14, align 4 | |
br label %148 | |
; <label>:148: ; preds = %138, %137 | |
%149 = bitcast i32* %23 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %149) #9 | |
%150 = load i32, i32* %14, align 4 | |
switch i32 %150, label %155 [ | |
i32 0, label %151 | |
] | |
; <label>:151: ; preds = %148 | |
br label %152 | |
; <label>:152: ; preds = %151 | |
%153 = load i32, i32* %22, align 4 | |
%154 = add nsw i32 %153, 1 | |
store i32 %154, i32* %22, align 4 | |
br label %122, !llvm.loop !80 | |
; <label>:155: ; preds = %148, %125 | |
%156 = bitcast i32* %22 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %156) #9 | |
br label %157 | |
; <label>:157: ; preds = %155 | |
store i32 5, i32* %14, align 4 | |
br label %188 | |
; <label>:158: ; preds = %109 | |
%159 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %159) #9 | |
store i32 0, i32* %24, align 4 | |
br label %160 | |
; <label>:160: ; preds = %183, %158 | |
%161 = load i32, i32* %24, align 4 | |
%162 = icmp slt i32 %161, 16 | |
br i1 %162, label %165, label %163 | |
; <label>:163: ; preds = %160 | |
store i32 11, i32* %14, align 4 | |
%164 = bitcast i32* %24 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %164) #9 | |
br label %186 | |
; <label>:165: ; preds = %160 | |
%166 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %166) #9 | |
%167 = load i32, i32* %17, align 4 | |
%168 = load i32, i32* %20, align 4 | |
%169 = load i32, i32* %24, align 4 | |
%170 = add nsw i32 %168, %169 | |
%171 = mul nsw i32 256, %170 | |
%172 = add nsw i32 %167, %171 | |
store i32 %172, i32* %25, align 4 | |
%173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%174 = load float, float* %19, align 4 | |
%175 = load i32, i32* %18, align 4 | |
%176 = load i32, i32* %7, align 4 | |
%177 = mul nsw i32 %175, %176 | |
%178 = load i32, i32* %25, align 4 | |
%179 = add nsw i32 %177, %178 | |
%180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %179) #10 | |
%181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10 | |
store float %181, float* %19, align 4 | |
%182 = bitcast i32* %25 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %182) #9 | |
br label %183 | |
; <label>:183: ; preds = %165 | |
%184 = load i32, i32* %24, align 4 | |
%185 = add nsw i32 %184, 1 | |
store i32 %185, i32* %24, align 4 | |
br label %160, !llvm.loop !81 | |
; <label>:186: ; preds = %163 | |
br label %187 | |
; <label>:187: ; preds = %186 | |
store i32 0, i32* %14, align 4 | |
br label %188 | |
; <label>:188: ; preds = %187, %157 | |
%189 = bitcast i32* %21 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %189) #9 | |
%190 = load i32, i32* %14, align 4 | |
switch i32 %190, label %195 [ | |
i32 0, label %191 | |
] | |
; <label>:191: ; preds = %188 | |
br label %192 | |
; <label>:192: ; preds = %191 | |
%193 = load i32, i32* %20, align 4 | |
%194 = add nsw i32 %193, 16 | |
store i32 %194, i32* %20, align 4 | |
br label %105, !llvm.loop !82 | |
; <label>:195: ; preds = %188, %108 | |
%196 = bitcast i32* %20 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %196) #9 | |
br label %197 | |
; <label>:197: ; preds = %195 | |
br label %198 | |
; <label>:198: ; preds = %197, %80 | |
%199 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %199) #9 | |
store i32 16, i32* %26, align 4 | |
br label %200 | |
; <label>:200: ; preds = %212, %198 | |
%201 = load i32, i32* %26, align 4 | |
%202 = icmp sgt i32 %201, 0 | |
br i1 %202, label %205, label %203 | |
; <label>:203: ; preds = %200 | |
store i32 14, i32* %14, align 4 | |
%204 = bitcast i32* %26 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %204) #9 | |
br label %215 | |
; <label>:205: ; preds = %200 | |
%206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%207 = load float, float* %19, align 4 | |
%208 = load i32, i32* %26, align 4 | |
%209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10 | |
%210 = load float, float* %19, align 4 | |
%211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10 | |
store float %211, float* %19, align 4 | |
br label %212 | |
; <label>:212: ; preds = %205 | |
%213 = load i32, i32* %26, align 4 | |
%214 = sdiv i32 %213, 2 | |
store i32 %214, i32* %26, align 4 | |
br label %200, !llvm.loop !83 | |
; <label>:215: ; preds = %203 | |
%216 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.start(i64 4, i8* %216) #9 | |
%217 = load i32, i32* %12, align 4 | |
%218 = and i32 %217, 31 | |
store i32 %218, i32* %27, align 4 | |
%219 = load i32, i32* %27, align 4 | |
%220 = icmp eq i32 %219, 0 | |
br i1 %220, label %221, label %230 | |
; <label>:221: ; preds = %215 | |
%222 = load i32, i32* %18, align 4 | |
%223 = load i32, i32* %6, align 4 | |
%224 = icmp slt i32 %222, %223 | |
br i1 %224, label %225, label %230 | |
; <label>:225: ; preds = %221 | |
%226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* | |
%227 = load i32, i32* %18, align 4 | |
%228 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %227) #10 | |
%229 = load float, float* %19, align 4 | |
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10 | |
br label %230 | |
; <label>:230: ; preds = %225, %221, %215 | |
%231 = bitcast i32* %27 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %231) #9 | |
%232 = bitcast float* %19 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %232) #9 | |
%233 = bitcast i32* %18 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %233) #9 | |
%234 = bitcast i32* %17 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %234) #9 | |
%235 = bitcast i32* %16 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %235) #9 | |
%236 = bitcast i32* %15 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %236) #9 | |
br label %237 | |
; <label>:237: ; preds = %230 | |
%238 = load i32, i32* %13, align 4 | |
%239 = add nsw i32 %238, 32 | |
store i32 %239, i32* %13, align 4 | |
br label %74 | |
; <label>:240: ; preds = %78 | |
%241 = bitcast i32* %12 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %241) #9 | |
%242 = bitcast i32* %11 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %242) #9 | |
%243 = bitcast i32* %10 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %243) #9 | |
%244 = bitcast i32* %9 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %244) #9 | |
%245 = bitcast i32* %8 to i8* | |
call void @llvm.lifetime.end(i64 4, i8* %245) #9 | |
ret void | |
} | |
; Function Attrs: convergent nounwind
;
; Unoptimized (-O0 style) device IR for Eigen's CUDA kernel
;   RowReduceKernel<GridSizeX=32, BlockSizeX=256, NumPerThread=128, ..., CudaMaxReducer>
; Arguments (aggregates passed byval):
;   %0 = reducer object            %1 = input  TensorEvaluator (rank-2 float)
;   %2 = i32, stored to slot %6    %3 = i32, stored to slot %7 (used as the
;        (outer/row count)              inner/row length and row stride)
;   %4 = output TensorEvaluator (rank-1 float)
; NOTE(review): the trailing "| |" table residue from the original paste has been
; stripped from every line of this function; no IR token was otherwise changed.
; Slot %14 is the frontend's cleanup-destination code: the small integers stored
; into it (0,2,5,8,11,14) steer the switches at labels %146/%185 so that loop
; break/continue passes through the llvm.lifetime.end cleanups.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
; Stack slots for the two i32 arguments and all locals.
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
; Launch-configuration asserts: blockDim must be (256,1,1) and gridDim (32,1,1);
; a mismatch calls the device-side __assert_fail stub (source lines 133-139).
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
; Derived loop bounds:
;   %8  = 16 (written here, never read again in this function;
;             presumably mirrors a source-level constant - TODO confirm)
;   %9  = ceil(arg3 / 32768)          (32768 = 256 threads * 128 per thread)
;   %10 = %9 * arg2                   (total work items for the outer loop)
;   %11 = blockIdx.x, %12 = threadIdx.x, %13 = outer loop counter (starts at blockIdx.x)
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; Outer loop: %13 = blockIdx.x; while (%13 < %10) { body; %13 += 32 }.
; The stride 32 equals the asserted gridDim.x, i.e. a grid-stride loop over blocks.
; <label>:74: ; preds = %232, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %235
; Outer-loop body:
;   %15 = %13 % %9   (column-block within the row)
;   %16 = %13 / %9   (row index, copied into %18)
;   %17 = %15*256*128 + threadIdx.x   (first column this thread reduces)
;   %19 = running reduction value, seeded with CudaMaxReducer::bottom_value().
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %98, float* %19, align 4
; Accumulation only happens for rows inside the bound: %18 < arg2.
%99 = load i32, i32* %18, align 4
%100 = load i32, i32* %6, align 4
%101 = icmp slt i32 %99, %100
br i1 %101, label %102, label %195
; Middle loop: %20 = 0, 16, 32, ... while %20 < 128, i.e. 8 strips of
; 16 elements per thread, each strip separated by 256 (the block width).
; <label>:102: ; preds = %80
%103 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %103) #9
store i32 0, i32* %20, align 4
br label %104
; <label>:104: ; preds = %189, %102
%105 = load i32, i32* %20, align 4
%106 = icmp slt i32 %105, 128
br i1 %106, label %108, label %107
; <label>:107: ; preds = %104
store i32 5, i32* %14, align 4
br label %192
; %21 = last column index this 16-element strip would touch
;     = %17 + 256*(%20 + 16 - 1).
; If it reaches past the row length (arg3) take the bounds-checked tail path
; (label %119); otherwise the unchecked fast path (label %156).
; <label>:108: ; preds = %104
%109 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %109) #9
%110 = load i32, i32* %17, align 4
%111 = load i32, i32* %20, align 4
%112 = add nsw i32 %111, 16
%113 = sub nsw i32 %112, 1
%114 = mul nsw i32 256, %113
%115 = add nsw i32 %110, %114
store i32 %115, i32* %21, align 4
%116 = load i32, i32* %21, align 4
%117 = load i32, i32* %7, align 4
%118 = icmp sge i32 %116, %117
br i1 %118, label %119, label %156
; Tail path: up to 15 iterations, each column individually checked
; against the row length before being reduced in.
; <label>:119: ; preds = %108
%120 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %120) #9
store i32 0, i32* %22, align 4
br label %121
; <label>:121: ; preds = %150, %119
%122 = load i32, i32* %22, align 4
%123 = icmp slt i32 %122, 15
br i1 %123, label %125, label %124
; <label>:124: ; preds = %121
store i32 8, i32* %14, align 4
br label %153
; <label>:125: ; preds = %121
%126 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %126) #9
%127 = load i32, i32* %17, align 4
%128 = load i32, i32* %20, align 4
%129 = load i32, i32* %22, align 4
%130 = add nsw i32 %128, %129
%131 = mul nsw i32 256, %130
%132 = add nsw i32 %127, %131
store i32 %132, i32* %23, align 4
%133 = load i32, i32* %23, align 4
%134 = load i32, i32* %7, align 4
%135 = icmp sge i32 %133, %134
br i1 %135, label %136, label %137
; <label>:136: ; preds = %125
store i32 8, i32* %14, align 4
br label %146
; In-bounds element: linear index = row*arg3 + column (%18*%7 + %23);
; reduced_val = reducer(reduced_val, input.coeff(index)).
; <label>:137: ; preds = %125
%138 = load float, float* %19, align 4
%139 = load i32, i32* %18, align 4
%140 = load i32, i32* %7, align 4
%141 = mul nsw i32 %139, %140
%142 = load i32, i32* %23, align 4
%143 = add nsw i32 %141, %142
%144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %143) #10
%145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10
store float %145, float* %19, align 4
store i32 0, i32* %14, align 4
br label %146
; Cleanup funnel for the tail loop: code 0 continues, anything else breaks.
; <label>:146: ; preds = %137, %136
%147 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %147) #9
%148 = load i32, i32* %14, align 4
switch i32 %148, label %153 [
i32 0, label %149
]
; <label>:149: ; preds = %146
br label %150
; <label>:150: ; preds = %149
%151 = load i32, i32* %22, align 4
%152 = add nsw i32 %151, 1
store i32 %152, i32* %22, align 4
br label %121, !llvm.loop !84
; <label>:153: ; preds = %146, %124
%154 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %154) #9
br label %155
; <label>:155: ; preds = %153
store i32 5, i32* %14, align 4
br label %185
; Fast path: 16 iterations with no per-element bounds check
; (the whole strip was proven in-bounds at label %108).
; <label>:156: ; preds = %108
%157 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %157) #9
store i32 0, i32* %24, align 4
br label %158
; <label>:158: ; preds = %180, %156
%159 = load i32, i32* %24, align 4
%160 = icmp slt i32 %159, 16
br i1 %160, label %163, label %161
; <label>:161: ; preds = %158
store i32 11, i32* %14, align 4
%162 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %162) #9
br label %183
; <label>:163: ; preds = %158
%164 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %164) #9
%165 = load i32, i32* %17, align 4
%166 = load i32, i32* %20, align 4
%167 = load i32, i32* %24, align 4
%168 = add nsw i32 %166, %167
%169 = mul nsw i32 256, %168
%170 = add nsw i32 %165, %169
store i32 %170, i32* %25, align 4
%171 = load float, float* %19, align 4
%172 = load i32, i32* %18, align 4
%173 = load i32, i32* %7, align 4
%174 = mul nsw i32 %172, %173
%175 = load i32, i32* %25, align 4
%176 = add nsw i32 %174, %175
%177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %176) #10
%178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10
store float %178, float* %19, align 4
%179 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %179) #9
br label %180
; <label>:180: ; preds = %163
%181 = load i32, i32* %24, align 4
%182 = add nsw i32 %181, 1
store i32 %182, i32* %24, align 4
br label %158, !llvm.loop !85
; <label>:183: ; preds = %161
br label %184
; <label>:184: ; preds = %183
store i32 0, i32* %14, align 4
br label %185
; Cleanup funnel for the middle loop: code 0 advances %20 by 16, else break.
; <label>:185: ; preds = %184, %155
%186 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %186) #9
%187 = load i32, i32* %14, align 4
switch i32 %187, label %192 [
i32 0, label %188
]
; <label>:188: ; preds = %185
br label %189
; <label>:189: ; preds = %188
%190 = load i32, i32* %20, align 4
%191 = add nsw i32 %190, 16
store i32 %191, i32* %20, align 4
br label %104, !llvm.loop !86
; <label>:192: ; preds = %185, %107
%193 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %193) #9
br label %194
; <label>:194: ; preds = %192
br label %195
; Warp-level reduction: for offset = 16,8,4,2,1 fold in
; __shfl_down(reduced_val, offset, width=32) via the reducer.
; (Pre-Volta mask-less shuffle; the *_sync form is required on Volta+.)
; <label>:195: ; preds = %194, %80
%196 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %196) #9
store i32 16, i32* %26, align 4
br label %197
; <label>:197: ; preds = %208, %195
%198 = load i32, i32* %26, align 4
%199 = icmp sgt i32 %198, 0
br i1 %199, label %202, label %200
; <label>:200: ; preds = %197
store i32 14, i32* %14, align 4
%201 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %201) #9
br label %211
; <label>:202: ; preds = %197
%203 = load float, float* %19, align 4
%204 = load i32, i32* %26, align 4
%205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10
%206 = load float, float* %19, align 4
%207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10
store float %207, float* %19, align 4
br label %208
; <label>:208: ; preds = %202
%209 = load i32, i32* %26, align 4
%210 = sdiv i32 %209, 2
store i32 %210, i32* %26, align 4
br label %197, !llvm.loop !87
; Publish: %27 = threadIdx.x & 31 (lane id). Lane 0 of each warp, and only
; for rows inside the bound (%18 < arg2), folds its partial into the output
; element via CudaMaxReducer::atomic_reduce(&output.coeffRef(row), val).
; <label>:211: ; preds = %200
%212 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %212) #9
%213 = load i32, i32* %12, align 4
%214 = and i32 %213, 31
store i32 %214, i32* %27, align 4
%215 = load i32, i32* %27, align 4
%216 = icmp eq i32 %215, 0
br i1 %216, label %217, label %225
; <label>:217: ; preds = %211
%218 = load i32, i32* %18, align 4
%219 = load i32, i32* %6, align 4
%220 = icmp slt i32 %218, %219
br i1 %220, label %221, label %225
; <label>:221: ; preds = %217
%222 = load i32, i32* %18, align 4
%223 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %222) #10
%224 = load float, float* %19, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10
br label %225
; End of outer-loop body: release per-iteration locals, bump %13 by 32.
; <label>:225: ; preds = %221, %217, %211
%226 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %226) #9
%227 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %227) #9
%228 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %228) #9
%229 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %229) #9
%230 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %230) #9
%231 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
br label %232
; <label>:232: ; preds = %225
%233 = load i32, i32* %13, align 4
%234 = add nsw i32 %233, 32
store i32 %234, i32* %13, align 4
br label %74
; Function exit: release the function-scope locals and return.
; <label>:235: ; preds = %78
%236 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
%237 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %237) #9
%238 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %238) #9
%239 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %239) #9
%240 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %240) #9
ret void
}
; Function Attrs: convergent nounwind | |
; ColumnReduceKernel<128, 256, 16> instantiated for Eigen's internal
; CudaSumReducer.  Reduces along the first (row) dimension of a 2-D float
; tensor evaluator (arg %1) and atomically folds one 16-row partial result
; per column into a 1-D float output evaluator (arg %4).  Arg %2 = num_rows
; (the reduced extent), arg %3 = num_cols.  Unoptimized (-O0-style) IR:
; every local lives in an alloca and is re-loaded at each use.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%6 = alloca i32, align 4 ; num_rows (copy of arg %2)
%7 = alloca i32, align 4 ; num_cols (copy of arg %3)
%8 = alloca i32, align 4 ; total work items = ceil(num_rows/16) * num_cols
%9 = alloca i32, align 4 ; blockIdx.x
%10 = alloca i32, align 4 ; threadIdx.x
%11 = alloca i32, align 4 ; current work-item index (grid-stride loop variable)
%12 = alloca i32 ; NOTE(review): only ever stored (2 / 5), never read here - presumably clang's cleanup-dest slot; confirm
%13 = alloca i32, align 4 ; column index of the current work item
%14 = alloca i32, align 4 ; first row of this work item's 16-row tile
%15 = alloca float, align 4 ; running partial reduction for this (column, tile)
%16 = alloca i32, align 4 ; inner tile loop counter, 0..15
%17 = alloca float, align 4 ; value fetched (or bottom value) for one row
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
; --- Launch-configuration asserts: blockDim must be exactly (256,1,1) ---
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; --- gridDim must be exactly (128,1,1) ---
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
; Work-item count: ceil-div num_rows by the 16-row tile, times num_cols.
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
; Linear thread id: blockIdx.x * 256 + threadIdx.x.
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; --- Outer grid-stride loop over work items (stride 128*256 = 32768) ---
; <label>:64: ; preds = %135, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %138
; <label>:70: ; preds = %64
; col = item % num_cols; row = ((item / num_cols) % ceil(num_rows/16)) * 16.
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
; Seed the accumulator with the reducer's bottom value (CudaSumReducer is
; backed by FnSumReducer<Identity>, hence the bitcast).
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10
store float %87, float* %15, align 4
%88 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %88) #9
store i32 0, i32* %16, align 4
br label %89
; --- Inner loop: accumulate up to 16 rows of this column ---
; <label>:89: ; preds = %124, %70
%90 = load i32, i32* %16, align 4
%91 = icmp slt i32 %90, 16
br i1 %91, label %94, label %92
; <label>:92: ; preds = %89
store i32 5, i32* %12, align 4
%93 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %93) #9
br label %127
; <label>:94: ; preds = %89
; Bounds-guarded read: if col < num_cols and row+j < num_rows, fetch
; input coeff((row + j) * num_cols + col); otherwise use the bottom value.
%95 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %13, align 4
%97 = load i32, i32* %7, align 4
%98 = icmp slt i32 %96, %97
br i1 %98, label %99, label %114
; <label>:99: ; preds = %94
%100 = load i32, i32* %14, align 4
%101 = load i32, i32* %16, align 4
%102 = add nsw i32 %100, %101
%103 = load i32, i32* %6, align 4
%104 = icmp slt i32 %102, %103
br i1 %104, label %105, label %114
; <label>:105: ; preds = %99
%106 = load i32, i32* %14, align 4
%107 = load i32, i32* %16, align 4
%108 = add nsw i32 %106, %107
%109 = load i32, i32* %7, align 4
%110 = mul nsw i32 %108, %109
%111 = load i32, i32* %13, align 4
%112 = add nsw i32 %110, %111
%113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %112) #10
br label %117
; <label>:114: ; preds = %99, %94
%115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10
br label %117
; <label>:117: ; preds = %114, %105
; Combine the fetched value into the accumulator via the reducer's operator().
%118 = phi float [ %113, %105 ], [ %116, %114 ]
store float %118, float* %17, align 4
%119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%120 = load float, float* %15, align 4
%121 = load float, float* %17, align 4
%122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10
store float %122, float* %15, align 4
%123 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %123) #9
br label %124
; <label>:124: ; preds = %117
%125 = load i32, i32* %16, align 4
%126 = add nsw i32 %125, 1
store i32 %126, i32* %16, align 4
br label %89
; <label>:127: ; preds = %92
; Publish the 16-row partial: atomic reduce into output element `col`.
%128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%129 = load i32, i32* %13, align 4
%130 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %129) #10
%131 = load float, float* %15, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10
%132 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %132) #9
%133 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %133) #9
%134 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %134) #9
br label %135
; <label>:135: ; preds = %127
; Advance to the next work item (grid stride).
%136 = load i32, i32* %11, align 4
%137 = add nsw i32 %136, 32768
store i32 %137, i32* %11, align 4
br label %64
; <label>:138: ; preds = %68
%139 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %139) #9
%140 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %140) #9
%141 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %141) #9
ret void
}
; Function Attrs: convergent nounwind | |
; ColumnReduceKernel<128, 256, 16> instantiated for Eigen's internal
; CudaMaxReducer.  Same structure as the CudaSumReducer instantiation:
; reduces along the first (row) dimension of a 2-D float tensor evaluator
; (arg %1) and atomically folds one 16-row partial result per column into
; a 1-D float output evaluator (arg %4).  Arg %2 = num_rows (the reduced
; extent), arg %3 = num_cols.  Unlike the sum variant, the reducer is
; called directly (no FnSumReducer bitcast) and is byval align 4, which
; matches CudaMaxReducer carrying state rather than being an empty struct.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%6 = alloca i32, align 4 ; num_rows (copy of arg %2)
%7 = alloca i32, align 4 ; num_cols (copy of arg %3)
%8 = alloca i32, align 4 ; total work items = ceil(num_rows/16) * num_cols
%9 = alloca i32, align 4 ; blockIdx.x
%10 = alloca i32, align 4 ; threadIdx.x
%11 = alloca i32, align 4 ; current work-item index (grid-stride loop variable)
%12 = alloca i32 ; NOTE(review): only ever stored (2 / 5), never read here - presumably clang's cleanup-dest slot; confirm
%13 = alloca i32, align 4 ; column index of the current work item
%14 = alloca i32, align 4 ; first row of this work item's 16-row tile
%15 = alloca float, align 4 ; running partial reduction for this (column, tile)
%16 = alloca i32, align 4 ; inner tile loop counter, 0..15
%17 = alloca float, align 4 ; value fetched (or bottom value) for one row
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
; --- Launch-configuration asserts: blockDim must be exactly (256,1,1) ---
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; --- gridDim must be exactly (128,1,1) ---
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
; Work-item count: ceil-div num_rows by the 16-row tile, times num_cols.
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
; Linear thread id: blockIdx.x * 256 + threadIdx.x.
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; --- Outer grid-stride loop over work items (stride 128*256 = 32768) ---
; <label>:64: ; preds = %131, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %134
; <label>:70: ; preds = %64
; col = item % num_cols; row = ((item / num_cols) % ceil(num_rows/16)) * 16.
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
; Seed the accumulator with the reducer's bottom value.
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %86, float* %15, align 4
%87 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %87) #9
store i32 0, i32* %16, align 4
br label %88
; --- Inner loop: accumulate up to 16 rows of this column ---
; <label>:88: ; preds = %121, %70
%89 = load i32, i32* %16, align 4
%90 = icmp slt i32 %89, 16
br i1 %90, label %93, label %91
; <label>:91: ; preds = %88
store i32 5, i32* %12, align 4
%92 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %92) #9
br label %124
; <label>:93: ; preds = %88
; Bounds-guarded read: if col < num_cols and row+j < num_rows, fetch
; input coeff((row + j) * num_cols + col); otherwise use the bottom value.
%94 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %94) #9
%95 = load i32, i32* %13, align 4
%96 = load i32, i32* %7, align 4
%97 = icmp slt i32 %95, %96
br i1 %97, label %98, label %113
; <label>:98: ; preds = %93
%99 = load i32, i32* %14, align 4
%100 = load i32, i32* %16, align 4
%101 = add nsw i32 %99, %100
%102 = load i32, i32* %6, align 4
%103 = icmp slt i32 %101, %102
br i1 %103, label %104, label %113
; <label>:104: ; preds = %98
%105 = load i32, i32* %14, align 4
%106 = load i32, i32* %16, align 4
%107 = add nsw i32 %105, %106
%108 = load i32, i32* %7, align 4
%109 = mul nsw i32 %107, %108
%110 = load i32, i32* %13, align 4
%111 = add nsw i32 %109, %110
%112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %111) #10
br label %115
; <label>:113: ; preds = %98, %93
%114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
br label %115
; <label>:115: ; preds = %113, %104
; Combine the fetched value into the accumulator via the reducer's operator().
%116 = phi float [ %112, %104 ], [ %114, %113 ]
store float %116, float* %17, align 4
%117 = load float, float* %15, align 4
%118 = load float, float* %17, align 4
%119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10
store float %119, float* %15, align 4
%120 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %120) #9
br label %121
; <label>:121: ; preds = %115
%122 = load i32, i32* %16, align 4
%123 = add nsw i32 %122, 1
store i32 %123, i32* %16, align 4
br label %88
; <label>:124: ; preds = %91
; Publish the 16-row partial: atomic reduce into output element `col`.
%125 = load i32, i32* %13, align 4
%126 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %125) #10
%127 = load float, float* %15, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10
%128 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %128) #9
%129 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %129) #9
%130 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %130) #9
br label %131
; <label>:131: ; preds = %124
; Advance to the next work item (grid stride).
%132 = load i32, i32* %11, align 4
%133 = add nsw i32 %132, 32768
store i32 %133, i32* %11, align 4
br label %64
; <label>:134: ; preds = %68
%135 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %135) #9
%136 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %136) #9
%137 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %137) #9
ret void
}
; Function Attrs: alwaysinline inlinehint | |
; libdevice-style wrapper: returns the high 32 bits of the full 64-bit
; unsigned product of %0 and %1, via the NVVM mulhi intrinsic.
define internal i32 @__nv_umulhi(i32, i32) #8 {
entry:
  %hi = call i32 @llvm.nvvm.mulhi.ui(i32 %0, i32 %1)
  ret i32 %hi
}
; Function Attrs: nounwind readnone | |
declare i32 @llvm.nvvm.mulhi.ui(i32, i32) #3 | |
; Function Attrs: alwaysinline inlinehint | |
; libdevice-style float max: dispatches at "link" time via __nvvm_reflect
; on the "$str" option string, selecting between the flush-to-zero and
; the IEEE-denormal variants of the NVVM fmax intrinsic.
define internal float @__nv_fmaxf(float, float) #8 {
entry:
  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %use_ftz = icmp ne i32 %reflect, 0
  br i1 %use_ftz, label %ftz, label %ieee

ftz:                                              ; preds = %entry
  %max_ftz = call float @llvm.nvvm.fmax.ftz.f(float %0, float %1)
  br label %done

ieee:                                             ; preds = %entry
  %max_ieee = call float @llvm.nvvm.fmax.f(float %0, float %1)
  br label %done

done:                                             ; preds = %ieee, %ftz
  %result = phi float [ %max_ftz, %ftz ], [ %max_ieee, %ieee ]
  ret float %result
}
declare i32 @__nvvm_reflect(i8*) | |
; Function Attrs: nounwind readnone | |
declare float @llvm.nvvm.fmax.ftz.f(float, float) #3 | |
; Function Attrs: nounwind readnone | |
declare float @llvm.nvvm.fmax.f(float, float) #3 | |
attributes #0 = { convergent nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { argmemonly nounwind } | |
attributes #2 = { alwaysinline convergent inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #3 = { nounwind readnone } | |
attributes #4 = { convergent inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #5 = { convergent nounwind } | |
attributes #6 = { argmemonly nounwind readonly } | |
attributes #7 = { convergent noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #8 = { alwaysinline inlinehint } | |
attributes #9 = { nounwind } | |
attributes #10 = { convergent } | |
attributes #11 = { convergent noreturn } | |
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !38, !40, !40, !40, !40, !41, !41, !40} | |
!llvm.module.flags = !{!42, !43} | |
!llvm.ident = !{!44} | |
!nvvm.internalize.after.link = !{} | |
!nvvmir.version = !{!45} | |
!0 = !{void (float, i32, float*)* @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_, !"kernel", i32 1} | |
!1 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!2 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!3 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!4 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!5 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!6 = !{void (float, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_, !"kernel", i32 1} | |
!7 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!8 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!9 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!10 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!11 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!12 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!13 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!14 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!15 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1} | |
!16 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!17 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!18 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!19 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!20 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!21 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!22 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!23 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!24 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!25 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!26 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!27 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!28 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!29 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!30 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!31 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!32 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!33 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1} | |
!34 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!35 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!36 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!37 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!38 = !{null, !"align", i32 8} | |
!39 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} | |
!40 = !{null, !"align", i32 16} | |
!41 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} | |
!42 = !{i32 4, !"nvvm-reflect-ftz", i32 0} | |
!43 = !{i32 1, !"PIC Level", i32 2} | |
!44 = !{!"clang version google3-trunk (trunk r271374)"} | |
!45 = !{i32 1, i32 2} | |
!46 = distinct !{!46, !47} | |
!47 = !{!"llvm.loop.unroll.count", i32 8} | |
!48 = distinct !{!48, !49} | |
!49 = !{!"llvm.loop.unroll.enable"} | |
!50 = !{i32 457534} | |
!51 = distinct !{!51, !49} | |
!52 = distinct !{!52, !49} | |
!53 = distinct !{!53, !49} | |
!54 = distinct !{!54, !55} | |
!55 = !{!"llvm.loop.unroll.disable"} | |
!56 = distinct !{!56, !49} | |
!57 = distinct !{!57, !49} | |
!58 = distinct !{!58, !49} | |
!59 = distinct !{!59, !55} | |
!60 = distinct !{!60, !49} | |
!61 = distinct !{!61, !49} | |
!62 = distinct !{!62, !49} | |
!63 = distinct !{!63, !55} | |
!64 = distinct !{!64, !49} | |
!65 = distinct !{!65, !49} | |
!66 = distinct !{!66, !49} | |
!67 = distinct !{!67, !55} | |
!68 = distinct !{!68, !49} | |
!69 = distinct !{!69, !47} | |
!70 = distinct !{!70, !49} | |
!71 = distinct !{!71, !49} | |
!72 = distinct !{!72, !49} | |
!73 = distinct !{!73, !49} | |
!74 = distinct !{!74, !55} | |
!75 = distinct !{!75, !49} | |
!76 = distinct !{!76, !49} | |
!77 = distinct !{!77, !49} | |
!78 = distinct !{!78, !55} | |
!79 = distinct !{!79, !49} | |
!80 = distinct !{!80, !49} | |
!81 = distinct !{!81, !49} | |
!82 = distinct !{!82, !55} | |
!83 = distinct !{!83, !49} | |
!84 = distinct !{!84, !49} | |
!85 = distinct !{!85, !49} | |
!86 = distinct !{!86, !55} | |
!87 = distinct !{!87, !49} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment