; Gist anonymous/e6e8822a01dde1bb20195b4002d8efc3, created June 1, 2016.
; ModuleID = '<stdin>'
source_filename = "cxx11_tensor_reduction_cuda-sm_35.cui"
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
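; This module is device-side LLVM IR emitted by clang's CUDA front end from
; Eigen's cxx11_tensor_reduction_cuda test, compiled for sm_35 on the
; nvptx64-nvidia-cuda target. It holds the template instantiations of Eigen's
; GPU reduction kernels (full, inner-dimension and outer-dimension reductions,
; plus the RowReduceKernel/ColumnReduceKernel variants) over 2-D float tensors
; in both layouts (Tensor<float, 2, 1, int> and Tensor<float, 2, 0, int>).
; The IR is still unoptimized: every value is spilled through an alloca and
; the lifetime markers have not been cleaned up yet.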
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_blockDim_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_gridDim_t = type { i8 }
%"struct.Eigen::internal::SumReducer" = type { i8 }
%"struct.Eigen::TensorEvaluator" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* }
%"class.Eigen::array" = type { [2 x i8] }
%"struct.Eigen::DSizes" = type { %"class.Eigen::array.0" }
%"class.Eigen::array.1" = type { [2 x i32] }
%"class.Eigen::array.2" = type { [1 x %"struct.Eigen::internal::TensorIntDivisor"] }
%"struct.Eigen::internal::TensorIntDivisor" = type { i32, i32, i32 }
%"class.Eigen::array.0" = type { [1 x i32] }
%"struct.Eigen::TensorEvaluator.3" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::DSizes.4" = type { %"class.Eigen::array.1" }
%"struct.Eigen::GpuDevice" = type { %"class.Eigen::StreamInterface"* }
%"class.Eigen::StreamInterface" = type { i32 (...)** }
%"struct.Eigen::internal::scalar_cast_op" = type { i8 }
%"struct.Eigen::TensorEvaluator.5" = type { %"struct.Eigen::TensorEvaluator", %"struct.Eigen::GpuDevice"*, float* }
%"struct.Eigen::internal::PtrWrapper" = type { float* }
%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" }
%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::Identity" }
%"struct.Eigen::internal::(anonymous namespace)::Identity" = type { i8 }
%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer" = type { float }
%"struct.Eigen::TensorEvaluator.6" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.8" }
%"struct.Eigen::TensorEvaluator.7" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.8" = type { %"struct.Eigen::TensorEvaluator", %"class.Eigen::TensorReductionOp", %"struct.Eigen::GpuDevice"*, float* }
%"class.Eigen::TensorReductionOp" = type <{ %"class.Eigen::TensorMap"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }>
%"class.Eigen::TensorMap" = type { float*, %"struct.Eigen::DSizes.4" }
%"struct.Eigen::TensorEvaluator.11" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator" }
%"struct.Eigen::TensorEvaluator.12" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.13" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.14" = type { %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::GpuDevice"*, float* }
%"struct.Eigen::TensorEvaluator.15" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.17" }
%"struct.Eigen::TensorEvaluator.16" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.17" = type { %"struct.Eigen::TensorEvaluator.12", %"class.Eigen::TensorReductionOp.18", %"struct.Eigen::GpuDevice"*, float* }
%"class.Eigen::TensorReductionOp.18" = type <{ %"class.Eigen::TensorMap.20"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }>
%"class.Eigen::TensorMap.20" = type { float*, %"struct.Eigen::DSizes.4" }
%"struct.Eigen::TensorEvaluator.24" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.12" }
$_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_ = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any
$_ZNK5Eigen8internal10SumReducerIfE10initializeEv = comdat any
$_ZN5Eigen6numext4miniIiEET_RKS2_S4_ = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi = comdat any
$_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf = comdat any
$_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE = comdat any
$_ZN5Eigen8internal14scalar_cast_opIifEC1Ev = comdat any
$_ZNK5Eigen8internal14scalar_cast_opIifEclERKi = comdat any
$_ZN5Eigen8internal14scalar_cast_opIifEC2Ev = comdat any
$_ZN5Eigen8internal4castIifEET0_RKT_ = comdat any
$_ZN5Eigen8internal9cast_implIifE3runERKi = comdat any
$_Z5__ldgPKf = comdat any
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen5divupIiijEET_T0_T1_ = comdat any
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen5divupIiEET_S1_S1_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi = comdat any
$_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi = comdat any
$_ZNK5Eigen8internal10SumReducerIfE8finalizeEf = comdat any
$_ZNK5Eigen5arrayIiLm1EEixEm = comdat any
$_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any
$_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm = comdat any
$_ZNK5Eigen8internal16TensorIntDivisorIiLb0EE6divideEi = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED2Ev = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_ = comdat any
$_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv = comdat any
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv = comdat any
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi = comdat any
$_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi = comdat any
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi = comdat any
$_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi = comdat any
$_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED2Ev = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any
$_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi = comdat any
$_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii = comdat any
$_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
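; Next come the extern_weak globals backing the CUDA builtin variables
; (blockIdx, blockDim, threadIdx, gridDim). They only exist so the variables
; have an address; the accessor functions below never touch them and instead
; read the PTX special registers directly. They are followed by the string
; constants (assert conditions, file name, __PRETTY_FUNCTION__ texts) used by
; the device-side assert()s in RowReduceKernel and ColumnReduceKernel.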
@blockIdx = extern_weak addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@blockDim = extern_weak addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
@threadIdx = extern_weak addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@gridDim = extern_weak addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1
@.str = private unnamed_addr constant [24 x i8] c"blockDim.x == BLOCK_DIM\00", align 1
@.str.1 = private unnamed_addr constant [76 x i8] c"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@.str.2 = private unnamed_addr constant [16 x i8] c"blockDim.y == 1\00", align 1
@.str.3 = private unnamed_addr constant [16 x i8] c"blockDim.z == 1\00", align 1
@.str.4 = private unnamed_addr constant [22 x i8] c"gridDim.x == GRID_DIM\00", align 1
@.str.5 = private unnamed_addr constant [15 x i8] c"gridDim.y == 1\00", align 1
@.str.6 = private unnamed_addr constant [15 x i8] c"gridDim.z == 1\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_(float, i32, float*) #0 comdat {
%4 = alloca float, align 4
%5 = alloca i32, align 4
%6 = alloca float*, align 8
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
store float %0, float* %4, align 4
store i32 %1, i32* %5, align 4
store float* %2, float** %6, align 8
%10 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %10) #9
%11 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%12 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%13 = mul i32 %11, %12
%14 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%15 = add i32 %13, %14
store i32 %15, i32* %7, align 4
%16 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %16) #9
%17 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%18 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%19 = mul i32 %17, %18
store i32 %19, i32* %8, align 4
%20 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %20) #9
%21 = load i32, i32* %7, align 4
store i32 %21, i32* %9, align 4
br label %22
; <label>:22: ; preds = %34, %3
%23 = load i32, i32* %9, align 4
%24 = load i32, i32* %5, align 4
%25 = icmp slt i32 %23, %24
br i1 %25, label %28, label %26
; <label>:26: ; preds = %22
%27 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %27) #9
br label %38
; <label>:28: ; preds = %22
%29 = load float, float* %4, align 4
%30 = load i32, i32* %9, align 4
%31 = sext i32 %30 to i64
%32 = load float*, float** %6, align 8
%33 = getelementptr inbounds float, float* %32, i64 %31
store float %29, float* %33, align 4
br label %34
; <label>:34: ; preds = %28
%35 = load i32, i32* %8, align 4
%36 = load i32, i32* %9, align 4
%37 = add nsw i32 %36, %35
store i32 %37, i32* %9, align 4
br label %22
; <label>:38: ; preds = %26
%39 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %39) #9
%40 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %40) #9
ret void
}
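; The four accessors below are how clang lowers blockIdx.x / blockDim.x /
; threadIdx.x / gridDim.x in device code: each __fetch_builtin_x call compiles
; to a read of the matching PTX special register (%ctaid.x, %ntid.x, %tid.x,
; %nctaid.x) through an llvm.ptx.read.* intrinsic.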
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture) #1
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.ctaid.x()
ret i32 %1
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.ntid.x()
ret i32 %1
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.tid.x()
ret i32 %1
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.nctaid.x()
ret i32 %1
}
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ctaid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.x() #3
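; FullReductionKernel<BlockSize=256, NumPerThread=128> for a SumReducer over a
; row-major 2-D float tensor. Each block owns a contiguous chunk of
; BlockSize * NumPerThread = 32768 coefficients; each thread strides through
; its share, then the per-thread partial sums are combined with a shfl.down
; warp tree and one atomicAdd per warp. A hedged CUDA-level sketch of the IR
; below:
;
;   const int first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
;   if (gridDim.x == 1) {                        // single block: init in-kernel
;     if (first_index == 0) *output = reducer.initialize();
;     __syncthreads();
;   }
;   float accum = reducer.initialize();          // 0.0f for SumReducer<float>
;   const int max_iter = numext::mini(num_coeffs - first_index,
;                                     BlockSize * NumPerThread);
;   for (int i = 0; i < max_iter; i += BlockSize)
;     reducer.reduce(input.coeff(first_index + i), &accum);
;   for (int offset = warpSize / 2; offset > 0; offset /= 2)   // warp tree
;     reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
;   if ((threadIdx.x & (warpSize - 1)) == 0)     // one lane per warp
;     atomicReduce(output, accum, reducer);      // atomicAdd for sums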
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, float*) #0 comdat {
%5 = alloca i32, align 4
%6 = alloca float*, align 8
%7 = alloca i32, align 4
%8 = alloca float, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca float, align 4
%15 = alloca i32, align 4
store i32 %2, i32* %5, align 4
store float* %3, float** %6, align 8
%16 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %16) #9
%17 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%18 = mul i32 %17, 256
%19 = mul i32 %18, 128
%20 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%21 = add i32 %19, %20
store i32 %21, i32* %7, align 4
%22 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%23 = icmp eq i32 %22, 1
br i1 %23, label %24, label %31
; <label>:24: ; preds = %4
%25 = load i32, i32* %7, align 4
%26 = icmp eq i32 %25, 0
br i1 %26, label %27, label %30
; <label>:27: ; preds = %24
%28 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
%29 = load float*, float** %6, align 8
store float %28, float* %29, align 4
br label %30
; <label>:30: ; preds = %27, %24
call void @llvm.cuda.syncthreads()
br label %31
; <label>:31: ; preds = %30, %4
%32 = bitcast float* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %32) #9
%33 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
store float %33, float* %8, align 4
%34 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %34) #9
%35 = load i32, i32* %5, align 4
%36 = load i32, i32* %7, align 4
%37 = sub nsw i32 %35, %36
store i32 %37, i32* %10, align 4
store i32 32768, i32* %11, align 4
%38 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %10, i32* dereferenceable(4) %11) #10
store i32 %38, i32* %9, align 4
%39 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %39) #9
store i32 0, i32* %12, align 4
br label %40
; <label>:40: ; preds = %58, %31
%41 = load i32, i32* %12, align 4
%42 = load i32, i32* %9, align 4
%43 = icmp slt i32 %41, %42
br i1 %43, label %46, label %44
; <label>:44: ; preds = %40
%45 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %45) #9
br label %61
; <label>:46: ; preds = %40
%47 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %47) #9
%48 = load i32, i32* %7, align 4
%49 = load i32, i32* %12, align 4
%50 = add nsw i32 %48, %49
store i32 %50, i32* %13, align 4
%51 = bitcast float* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %51) #9
%52 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10
%53 = load i32, i32* %13, align 4
%54 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %52, i32 %53) #10
store float %54, float* %14, align 4
%55 = load float, float* %14, align 4
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %55, float* %8) #10
%56 = bitcast float* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %56) #9
%57 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %57) #9
br label %58
; <label>:58: ; preds = %46
%59 = load i32, i32* %12, align 4
%60 = add nsw i32 %59, 256
store i32 %60, i32* %12, align 4
br label %40, !llvm.loop !46
; <label>:61: ; preds = %44
%62 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %62) #9
store i32 16, i32* %15, align 4
br label %63
; <label>:63: ; preds = %72, %61
%64 = load i32, i32* %15, align 4
%65 = icmp sgt i32 %64, 0
br i1 %65, label %68, label %66
; <label>:66: ; preds = %63
%67 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %67) #9
br label %75
; <label>:68: ; preds = %63
%69 = load float, float* %8, align 4
%70 = load i32, i32* %15, align 4
%71 = call float @_ZL11__shfl_downfji(float %69, i32 %70, i32 32) #10
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %71, float* %8) #10
br label %72
; <label>:72: ; preds = %68
%73 = load i32, i32* %15, align 4
%74 = sdiv i32 %73, 2
store i32 %74, i32* %15, align 4
br label %63, !llvm.loop !48
; <label>:75: ; preds = %66
%76 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%77 = and i32 %76, 31
%78 = icmp eq i32 %77, 0
br i1 %78, label %79, label %82
; <label>:79: ; preds = %75
%80 = load float*, float** %6, align 8
%81 = load float, float* %8, align 4
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %80, float %81, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10
br label %82
; <label>:82: ; preds = %79, %75
%83 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %83) #9
%84 = bitcast float* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %84) #9
%85 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %85) #9
ret void
}
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"*) #4 comdat align 2 {
%2 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
%3 = alloca %"struct.Eigen::internal::scalar_cast_op", align 1
%4 = alloca i32, align 4
store %"struct.Eigen::internal::SumReducer"* %0, %"struct.Eigen::internal::SumReducer"** %2, align 8
%5 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %2, align 8
%6 = bitcast %"struct.Eigen::internal::scalar_cast_op"* %3 to i8*
call void @llvm.lifetime.start(i64 1, i8* %6) #9
call void @_ZN5Eigen8internal14scalar_cast_opIifEC1Ev(%"struct.Eigen::internal::scalar_cast_op"* %3) #10
store i32 0, i32* %4, align 4
%7 = call float @_ZNK5Eigen8internal14scalar_cast_opIifEclERKi(%"struct.Eigen::internal::scalar_cast_op"* %3, i32* dereferenceable(4) %4) #10
%8 = bitcast %"struct.Eigen::internal::scalar_cast_op"* %3 to i8*
call void @llvm.lifetime.end(i64 1, i8* %8) #9
ret float %7
}
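; SumReducer<float>::initialize() constructs a scalar_cast_op<int, float> and
; applies it to the literal 0, so it just returns 0.0f; the whole helper chain
; it calls (scalar_cast_op -> cast -> cast_impl::run -> sitofp, defined further
; down) would collapse to a constant once the IR is optimized.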
; Function Attrs: convergent nounwind
declare void @llvm.cuda.syncthreads() #5
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4), i32* dereferenceable(4)) #2 comdat {
%3 = alloca i32*, align 8
%4 = alloca i32*, align 8
store i32* %0, i32** %3, align 8
store i32* %1, i32** %4, align 8
%5 = load i32*, i32** %4, align 8
%6 = load i32, i32* %5, align 4
%7 = load i32*, i32** %3, align 8
%8 = load i32, i32* %7, align 4
%9 = icmp slt i32 %6, %8
br i1 %9, label %10, label %13
; <label>:10: ; preds = %2
%11 = load i32*, i32** %4, align 8
%12 = load i32, i32* %11, align 4
br label %16
; <label>:13: ; preds = %2
%14 = load i32*, i32** %3, align 8
%15 = load i32, i32* %14, align 4
br label %16
; <label>:16: ; preds = %13, %10
%17 = phi i32 [ %12, %10 ], [ %15, %13 ]
ret i32 %17
}
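; numext::mini(a, b) above returns (b < a) ? b : a, i.e. the minimum,
; preferring the first argument on ties.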
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.3"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.3"* %0, %"struct.Eigen::TensorEvaluator.3"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.3"*, %"struct.Eigen::TensorEvaluator.3"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %5, i32 0, i32 0
%7 = load float*, float** %6, align 8
%8 = load i32, i32* %4, align 4
%9 = sext i32 %8 to i64
%10 = getelementptr inbounds float, float* %7, i64 %9
%11 = call float @_ZN5Eigen12_GLOBAL__N_112loadConstantIfEET_PKS2_(float* %10) #10
ret float %11
}
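; TensorEvaluator::coeff(index) is a plain indexed load, routed through
; loadConstant/__ldg so the read goes through the GPU's read-only data cache
; (PTX ld.global.nc); the input tensor is const, so this is safe.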
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"*, float, float*) #4 comdat align 2 {
%4 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
%5 = alloca float, align 4
%6 = alloca float*, align 8
store %"struct.Eigen::internal::SumReducer"* %0, %"struct.Eigen::internal::SumReducer"** %4, align 8
store float %1, float* %5, align 4
store float* %2, float** %6, align 8
%7 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %4, align 8
%8 = load float, float* %5, align 4
%9 = load float*, float** %6, align 8
%10 = load float, float* %9, align 4
%11 = fadd float %10, %8
store float %11, float* %9, align 4
ret void
}
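; SumReducer<float>::reduce(t, accum) performs `*accum += t` (the fadd above);
; the reducer carries no state, which is why the struct is a single byte.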
; Function Attrs: convergent inlinehint nounwind
define internal float @_ZL11__shfl_downfji(float, i32, i32) #4 {
%4 = alloca float, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca float, align 4
%8 = alloca i32, align 4
store float %0, float* %4, align 4
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
%9 = bitcast float* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %10) #9
%11 = load i32, i32* %6, align 4
%12 = sub nsw i32 32, %11
%13 = shl i32 %12, 8
%14 = or i32 %13, 31
store i32 %14, i32* %8, align 4
%15 = load float, float* %4, align 4
%16 = load i32, i32* %5, align 4
%17 = load i32, i32* %8, align 4
%18 = call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %15, i32 %16, i32 %17) #5, !srcloc !50
store float %18, float* %7, align 4
%19 = load float, float* %7, align 4
%20 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %20) #9
%21 = bitcast float* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %21) #9
ret float %19
}
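; __shfl_down(var, delta, width) is implemented with inline PTX above. The
; third asm operand packs the shuffle control word:
;   ((warpSize - width) << 8) | 31
; For width == 32 that is 0x1f, the standard full-warp shfl.down mask; smaller
; widths set the upper byte so lanes cannot pull values across their sub-warp
; segment boundary.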
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float*, float, %"struct.Eigen::internal::SumReducer"* dereferenceable(1)) #4 comdat {
%4 = alloca float*, align 8
%5 = alloca float, align 4
%6 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
store float* %0, float** %4, align 8
store float %1, float* %5, align 4
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %6, align 8
%7 = load float*, float** %4, align 8
%8 = load float, float* %5, align 4
%9 = call float @_ZL9atomicAddPff(float* %7, float %8) #10
ret void
}
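; atomicReduce<float> for SumReducer ignores the reducer object (it is stored
; but never read) and forwards to atomicAdd(output, accum), which bottoms out
; in the llvm.nvvm.atomic.load.add.f32 intrinsic (PTX atom.add.f32).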
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal14scalar_cast_opIifEC1Ev(%"struct.Eigen::internal::scalar_cast_op"*) unnamed_addr #4 comdat align 2 {
%2 = alloca %"struct.Eigen::internal::scalar_cast_op"*, align 8
store %"struct.Eigen::internal::scalar_cast_op"* %0, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8
%3 = load %"struct.Eigen::internal::scalar_cast_op"*, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8
call void @_ZN5Eigen8internal14scalar_cast_opIifEC2Ev(%"struct.Eigen::internal::scalar_cast_op"* %3) #10
ret void
}
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen8internal14scalar_cast_opIifEclERKi(%"struct.Eigen::internal::scalar_cast_op"*, i32* dereferenceable(4)) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::internal::scalar_cast_op"*, align 8
%4 = alloca i32*, align 8
store %"struct.Eigen::internal::scalar_cast_op"* %0, %"struct.Eigen::internal::scalar_cast_op"** %3, align 8
store i32* %1, i32** %4, align 8
%5 = load %"struct.Eigen::internal::scalar_cast_op"*, %"struct.Eigen::internal::scalar_cast_op"** %3, align 8
%6 = load i32*, i32** %4, align 8
%7 = call float @_ZN5Eigen8internal4castIifEET0_RKT_(i32* dereferenceable(4) %6) #10
ret float %7
}
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal14scalar_cast_opIifEC2Ev(%"struct.Eigen::internal::scalar_cast_op"*) unnamed_addr #4 comdat align 2 {
%2 = alloca %"struct.Eigen::internal::scalar_cast_op"*, align 8
store %"struct.Eigen::internal::scalar_cast_op"* %0, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8
%3 = load %"struct.Eigen::internal::scalar_cast_op"*, %"struct.Eigen::internal::scalar_cast_op"** %2, align 8
ret void
}
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZN5Eigen8internal4castIifEET0_RKT_(i32* dereferenceable(4)) #4 comdat {
%2 = alloca i32*, align 8
store i32* %0, i32** %2, align 8
%3 = load i32*, i32** %2, align 8
%4 = call float @_ZN5Eigen8internal9cast_implIifE3runERKi(i32* dereferenceable(4) %3) #10
ret float %4
}
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZN5Eigen8internal9cast_implIifE3runERKi(i32* dereferenceable(4)) #4 comdat align 2 {
%2 = alloca i32*, align 8
store i32* %0, i32** %2, align 8
%3 = load i32*, i32** %2, align 8
%4 = load i32, i32* %3, align 4
%5 = sitofp i32 %4 to float
ret float %5
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @_ZN5Eigen12_GLOBAL__N_112loadConstantIfEET_PKS2_(float*) #2 {
%2 = alloca float*, align 8
store float* %0, float** %2, align 8
%3 = load float*, float** %2, align 8
%4 = call float @_Z5__ldgPKf(float* %3) #10
ret float %4
}
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_Z5__ldgPKf(float*) #4 comdat {
%2 = alloca float*, align 8
store float* %0, float** %2, align 8
%3 = load float*, float** %2, align 8
%4 = call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %3, i32 4)
ret float %4
}
; Function Attrs: argmemonly nounwind readonly
declare float @llvm.nvvm.ldg.global.f.f32.p0f32(float* nocapture, i32) #6
; Function Attrs: convergent inlinehint nounwind
define internal float @_ZL9atomicAddPff(float*, float) #4 {
%3 = alloca float*, align 8
%4 = alloca float, align 4
store float* %0, float** %3, align 8
store float %1, float* %4, align 4
%5 = load float*, float** %3, align 8
%6 = load float, float* %4, align 4
%7 = call float @_ZL12__fAtomicAddPff(float* %5, float %6) #10
ret float %7
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @_ZL12__fAtomicAddPff(float*, float) #2 {
%3 = alloca float*, align 8
%4 = alloca float, align 4
store float* %0, float** %3, align 8
store float %1, float* %4, align 4
%5 = load float*, float** %3, align 8
%6 = load float, float* %4, align 4
%7 = call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %5, float %6)
ret float %7
}
; Function Attrs: argmemonly nounwind
declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* nocapture, float) #1
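; InnerReductionKernel<NumPerThread=128>: reduces along the inner (contiguous)
; dimension. If gridDim.x == 1, the output row is first initialized with a
; grid-stride loop as in ReductionInitKernel. Blocks then iterate over
; (row, column-block) pairs starting at blockIdx.x; each thread accumulates
; NumPerThread coefficients of its row in chunks of unroll_times = 16, with a
; guarded remainder loop for the tail of the row. A hedged CUDA-level sketch
; of the per-row body as implemented by the IR below (the warp-level combine
; at the end of the kernel falls outside this excerpt):
;
;   const int row = i / input_col_blocks;        // i indexes (row, col-block)
;   const int col_block = i % input_col_blocks;
;   const int col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
;   float accum = reducer.initialize();
;   for (int j = 0; j < NumPerThread; j += unroll_times) {
;     const int last_col = col_begin + blockDim.x * (j + unroll_times - 1);
;     if (last_col >= num_coeffs_to_reduce) {    // tail: guarded loop
;       for (int col = col_begin + blockDim.x * j;
;            col < num_coeffs_to_reduce; col += blockDim.x)
;         reducer.reduce(input.coeff(row * num_coeffs_to_reduce + col), &accum);
;       break;
;     } else {                                   // body: unrolled by 16
;       for (int k = 0; k < unroll_times; ++k) {
;         const int col = col_begin + blockDim.x * (j + k);
;         reducer.reduce(input.coeff(row * num_coeffs_to_reduce + col), &accum);
;       }
;     }
;   }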
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca float*, align 8
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca i32, align 4
%16 = alloca i32
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca i32, align 4
%20 = alloca float, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca float, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
store float* %4, float** %8, align 8
%28 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %28) #9
store i32 16, i32* %9, align 4
%29 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %29) #9
%30 = load i32, i32* %6, align 4
%31 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%32 = mul i32 %31, 128
%33 = call i32 @_ZN5Eigen5divupIiijEET_T0_T1_(i32 %30, i32 %32) #10
store i32 %33, i32* %10, align 4
%34 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %34) #9
%35 = load i32, i32* %10, align 4
%36 = load i32, i32* %7, align 4
%37 = mul nsw i32 %35, %36
store i32 %37, i32* %11, align 4
%38 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %38) #9
%39 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%40 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%41 = mul i32 %39, %40
store i32 %41, i32* %12, align 4
%42 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %42) #9
%43 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%44 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%45 = mul i32 %43, %44
%46 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%47 = add i32 %45, %46
store i32 %47, i32* %13, align 4
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %70
; <label>:50: ; preds = %5
%51 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %51) #9
%52 = load i32, i32* %13, align 4
store i32 %52, i32* %14, align 4
br label %53
; <label>:53: ; preds = %65, %50
%54 = load i32, i32* %14, align 4
%55 = load i32, i32* %7, align 4
%56 = icmp slt i32 %54, %55
br i1 %56, label %59, label %57
; <label>:57: ; preds = %53
%58 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %58) #9
br label %69
; <label>:59: ; preds = %53
%60 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
%61 = load i32, i32* %14, align 4
%62 = sext i32 %61 to i64
%63 = load float*, float** %8, align 8
%64 = getelementptr inbounds float, float* %63, i64 %62
store float %60, float* %64, align 4
br label %65
; <label>:65: ; preds = %59
%66 = load i32, i32* %12, align 4
%67 = load i32, i32* %14, align 4
%68 = add nsw i32 %67, %66
store i32 %68, i32* %14, align 4
br label %53
; <label>:69: ; preds = %57
br label %70
; <label>:70: ; preds = %69, %5
%71 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %72, i32* %15, align 4
br label %73
; <label>:73: ; preds = %215, %70
%74 = load i32, i32* %15, align 4
%75 = load i32, i32* %11, align 4
%76 = icmp slt i32 %74, %75
br i1 %76, label %79, label %77
; <label>:77: ; preds = %73
store i32 5, i32* %16, align 4
%78 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %78) #9
br label %219
; <label>:79: ; preds = %73
%80 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %80) #9
%81 = load i32, i32* %15, align 4
%82 = load i32, i32* %10, align 4
%83 = sdiv i32 %81, %82
store i32 %83, i32* %17, align 4
%84 = load i32, i32* %17, align 4
%85 = load i32, i32* %7, align 4
%86 = icmp slt i32 %84, %85
br i1 %86, label %87, label %213
; <label>:87: ; preds = %79
%88 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %88) #9
%89 = load i32, i32* %15, align 4
%90 = load i32, i32* %10, align 4
%91 = srem i32 %89, %90
store i32 %91, i32* %18, align 4
%92 = bitcast i32* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %92) #9
%93 = load i32, i32* %18, align 4
%94 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%95 = mul i32 %93, %94
%96 = mul i32 %95, 128
%97 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%98 = add i32 %96, %97
store i32 %98, i32* %19, align 4
%99 = bitcast float* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %99) #9
%100 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
store float %100, float* %20, align 4
%101 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %101) #9
store i32 0, i32* %21, align 4
br label %102
; <label>:102: ; preds = %180, %87
%103 = load i32, i32* %21, align 4
%104 = icmp slt i32 %103, 128
br i1 %104, label %106, label %105
; <label>:105: ; preds = %102
store i32 8, i32* %16, align 4
br label %183
; <label>:106: ; preds = %102
%107 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %107) #9
%108 = load i32, i32* %19, align 4
%109 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%110 = load i32, i32* %21, align 4
%111 = add nsw i32 %110, 16
%112 = sub nsw i32 %111, 1
%113 = mul i32 %109, %112
%114 = add i32 %108, %113
store i32 %114, i32* %22, align 4
%115 = load i32, i32* %22, align 4
%116 = load i32, i32* %6, align 4
%117 = icmp sge i32 %115, %116
br i1 %117, label %118, label %147
; <label>:118: ; preds = %106
%119 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %119) #9
%120 = load i32, i32* %19, align 4
%121 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%122 = load i32, i32* %21, align 4
%123 = mul i32 %121, %122
%124 = add i32 %120, %123
store i32 %124, i32* %23, align 4
br label %125
; <label>:125: ; preds = %142, %118
%126 = load i32, i32* %23, align 4
%127 = load i32, i32* %6, align 4
%128 = icmp slt i32 %126, %127
br i1 %128, label %131, label %129
; <label>:129: ; preds = %125
store i32 11, i32* %16, align 4
%130 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %130) #9
br label %146
; <label>:131: ; preds = %125
%132 = bitcast float* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %132) #9
%133 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10
%134 = load i32, i32* %17, align 4
%135 = load i32, i32* %6, align 4
%136 = mul nsw i32 %134, %135
%137 = load i32, i32* %23, align 4
%138 = add nsw i32 %136, %137
%139 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %133, i32 %138) #10
store float %139, float* %24, align 4
%140 = load float, float* %24, align 4
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %140, float* %20) #10
%141 = bitcast float* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %141) #9
br label %142
; <label>:142: ; preds = %131
%143 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%144 = load i32, i32* %23, align 4
%145 = add i32 %144, %143
store i32 %145, i32* %23, align 4
br label %125
; <label>:146: ; preds = %129
store i32 8, i32* %16, align 4
br label %176
; <label>:147: ; preds = %106
%148 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %148) #9
store i32 0, i32* %25, align 4
br label %149
; <label>:149: ; preds = %171, %147
%150 = load i32, i32* %25, align 4
%151 = icmp slt i32 %150, 16
br i1 %151, label %154, label %152
; <label>:152: ; preds = %149
store i32 14, i32* %16, align 4
%153 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %153) #9
br label %174
; <label>:154: ; preds = %149
%155 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %155) #9
%156 = load i32, i32* %19, align 4
%157 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%158 = load i32, i32* %21, align 4
%159 = load i32, i32* %25, align 4
%160 = add nsw i32 %158, %159
%161 = mul i32 %157, %160
%162 = add i32 %156, %161
store i32 %162, i32* %26, align 4
%163 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10
%164 = load i32, i32* %17, align 4
%165 = load i32, i32* %6, align 4
%166 = mul nsw i32 %164, %165
%167 = load i32, i32* %26, align 4
%168 = add nsw i32 %166, %167
%169 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %163, i32 %168) #10
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %169, float* %20) #10
%170 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %170) #9
br label %171
; <label>:171: ; preds = %154
%172 = load i32, i32* %25, align 4
%173 = add nsw i32 %172, 1
store i32 %173, i32* %25, align 4
br label %149, !llvm.loop !51
; <label>:174: ; preds = %152
br label %175
; <label>:175: ; preds = %174
store i32 0, i32* %16, align 4
br label %176
; <label>:176: ; preds = %175, %146
%177 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %177) #9
%178 = load i32, i32* %16, align 4
switch i32 %178, label %183 [
i32 0, label %179
]
; <label>:179: ; preds = %176
br label %180
; <label>:180: ; preds = %179
%181 = load i32, i32* %21, align 4
%182 = add nsw i32 %181, 16
store i32 %182, i32* %21, align 4
br label %102
; <label>:183: ; preds = %176, %105
%184 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %184) #9
br label %185
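; __syncthreads, then a warp-level tree reduction: the running sum in %20 is folded
; with __shfl_down at offsets 16, 8, 4, 2, 1 (warp width 32).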
; <label>:185: ; preds = %183
call void @llvm.cuda.syncthreads()
%186 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %186) #9
store i32 16, i32* %27, align 4
br label %187
; <label>:187: ; preds = %196, %185
%188 = load i32, i32* %27, align 4
%189 = icmp sgt i32 %188, 0
br i1 %189, label %192, label %190
; <label>:190: ; preds = %187
store i32 17, i32* %16, align 4
%191 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %191) #9
br label %199
; <label>:192: ; preds = %187
%193 = load float, float* %20, align 4
%194 = load i32, i32* %27, align 4
%195 = call float @_ZL11__shfl_downfji(float %193, i32 %194, i32 32) #10
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %195, float* %20) #10
br label %196
; <label>:196: ; preds = %192
%197 = load i32, i32* %27, align 4
%198 = sdiv i32 %197, 2
store i32 %198, i32* %27, align 4
br label %187
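; Lane 0 of each warp (threadIdx.x & 31 == 0) atomically merges its warp's partial
; sum into the output element for this row via Eigen::internal::atomicReduce.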
; <label>:199: ; preds = %190
%200 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%201 = and i32 %200, 31
%202 = icmp eq i32 %201, 0
br i1 %202, label %203, label %209
; <label>:203: ; preds = %199
%204 = load i32, i32* %17, align 4
%205 = sext i32 %204 to i64
%206 = load float*, float** %8, align 8
%207 = getelementptr inbounds float, float* %206, i64 %205
%208 = load float, float* %20, align 4
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %207, float %208, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10
br label %209
; <label>:209: ; preds = %203, %199
%210 = bitcast float* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %210) #9
%211 = bitcast i32* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %211) #9
%212 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %212) #9
br label %213
; <label>:213: ; preds = %209, %79
call void @llvm.cuda.syncthreads()
%214 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %214) #9
br label %215
; <label>:215: ; preds = %213
%216 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%217 = load i32, i32* %15, align 4
%218 = add i32 %217, %216
store i32 %218, i32* %15, align 4
br label %73
; <label>:219: ; preds = %77
%220 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %220) #9
%221 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %221) #9
%222 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %222) #9
%223 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %223) #9
%224 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %224) #9
ret void
}
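; int Eigen::divup<int, int, unsigned int>(int, unsigned int): ceiling division,
; returns (x + y - 1) / y using an unsigned divide.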
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN5Eigen5divupIiijEET_T0_T1_(i32, i32) #2 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
store i32 %0, i32* %3, align 4
store i32 %1, i32* %4, align 4
%5 = load i32, i32* %3, align 4
%6 = load i32, i32* %4, align 4
%7 = add i32 %5, %6
%8 = sub i32 %7, 1
%9 = load i32, i32* %4, align 4
%10 = udiv i32 %8, %9
ret i32 %10
}
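; Eigen::internal::OuterReductionKernel<16, TensorEvaluator<const TensorReductionOp<
;   SumReducer<float>, const array<int,1>, const TensorMap<Tensor<float,2,1,int>,0>>,
;   GpuDevice>, int>(reducer, evaluator, num_coeffs_to_reduce, num_preserved_coeffs, output).
; If a single block was launched, the output vector is first filled with
; reducer.initialize() and the block synchronized. Each thread then walks a
; grid-stride loop over (row-block, preserved-column) pairs: it locally sums up to
; 16 coefficients of one column (upper bound clamped via numext::mini) and
; atomically folds the partial result into output[column] with atomicReduce.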
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca float*, align 8
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
%18 = alloca i32, align 4
%19 = alloca i32, align 4
%20 = alloca i32, align 4
%21 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
store float* %4, float** %8, align 8
%22 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %22) #9
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%24 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%25 = mul i32 %23, %24
store i32 %25, i32* %9, align 4
%26 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %26) #9
%27 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = mul i32 %27, %28
%30 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%31 = add i32 %29, %30
store i32 %31, i32* %10, align 4
%32 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%33 = icmp eq i32 %32, 1
br i1 %33, label %34, label %54
; <label>:34: ; preds = %5
%35 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %35) #9
%36 = load i32, i32* %10, align 4
store i32 %36, i32* %11, align 4
br label %37
; <label>:37: ; preds = %49, %34
%38 = load i32, i32* %11, align 4
%39 = load i32, i32* %7, align 4
%40 = icmp slt i32 %38, %39
br i1 %40, label %43, label %41
; <label>:41: ; preds = %37
%42 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %42) #9
br label %53
; <label>:43: ; preds = %37
%44 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
%45 = load i32, i32* %11, align 4
%46 = sext i32 %45 to i64
%47 = load float*, float** %8, align 8
%48 = getelementptr inbounds float, float* %47, i64 %46
store float %44, float* %48, align 4
br label %49
; <label>:49: ; preds = %43
%50 = load i32, i32* %9, align 4
%51 = load i32, i32* %11, align 4
%52 = add nsw i32 %51, %50
store i32 %52, i32* %11, align 4
br label %37
; <label>:53: ; preds = %41
call void @llvm.cuda.syncthreads()
br label %54
; <label>:54: ; preds = %53, %5
%55 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = load i32, i32* %7, align 4
%57 = load i32, i32* %6, align 4
%58 = call i32 @_ZN5Eigen5divupIiEET_S1_S1_(i32 %57, i32 16) #10
%59 = mul nsw i32 %56, %58
store i32 %59, i32* %12, align 4
%60 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %60) #9
%61 = load i32, i32* %10, align 4
store i32 %61, i32* %13, align 4
br label %62
; <label>:62: ; preds = %116, %54
%63 = load i32, i32* %13, align 4
%64 = load i32, i32* %12, align 4
%65 = icmp slt i32 %63, %64
br i1 %65, label %68, label %66
; <label>:66: ; preds = %62
store i32 5, i32* %14, align 4
%67 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %67) #9
br label %120
; <label>:68: ; preds = %62
%69 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %69) #9
%70 = load i32, i32* %13, align 4
%71 = load i32, i32* %7, align 4
%72 = srem i32 %70, %71
store i32 %72, i32* %15, align 4
%73 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %73) #9
%74 = load i32, i32* %13, align 4
%75 = load i32, i32* %7, align 4
%76 = sdiv i32 %74, %75
%77 = mul nsw i32 %76, 16
store i32 %77, i32* %16, align 4
%78 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %78) #9
%79 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
store float %79, float* %17, align 4
%80 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %80) #9
%81 = load i32, i32* %16, align 4
%82 = add nsw i32 %81, 16
store i32 %82, i32* %19, align 4
%83 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %19, i32* dereferenceable(4) %6) #10
store i32 %83, i32* %18, align 4
%84 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %84) #9
%85 = load i32, i32* %16, align 4
store i32 %85, i32* %20, align 4
br label %86
; <label>:86: ; preds = %103, %68
%87 = load i32, i32* %20, align 4
%88 = load i32, i32* %18, align 4
%89 = icmp slt i32 %87, %88
br i1 %89, label %92, label %90
; <label>:90: ; preds = %86
store i32 8, i32* %14, align 4
%91 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %91) #9
br label %106
; <label>:92: ; preds = %86
%93 = bitcast float* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %93) #9
%94 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i32 0, i32 10
%95 = load i32, i32* %20, align 4
%96 = load i32, i32* %7, align 4
%97 = mul nsw i32 %95, %96
%98 = load i32, i32* %15, align 4
%99 = add nsw i32 %97, %98
%100 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %94, i32 %99) #10
store float %100, float* %21, align 4
%101 = load float, float* %21, align 4
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %101, float* %17) #10
%102 = bitcast float* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %102) #9
br label %103
; <label>:103: ; preds = %92
%104 = load i32, i32* %20, align 4
%105 = add nsw i32 %104, 1
store i32 %105, i32* %20, align 4
br label %86
; <label>:106: ; preds = %90
%107 = load i32, i32* %15, align 4
%108 = sext i32 %107 to i64
%109 = load float*, float** %8, align 8
%110 = getelementptr inbounds float, float* %109, i64 %108
%111 = load float, float* %17, align 4
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %110, float %111, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10
%112 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %112) #9
%113 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %113) #9
%114 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %114) #9
%115 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %115) #9
br label %116
; <label>:116: ; preds = %106
%117 = load i32, i32* %9, align 4
%118 = load i32, i32* %13, align 4
%119 = add nsw i32 %118, %117
store i32 %119, i32* %13, align 4
br label %62
; <label>:120: ; preds = %66
%121 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %121) #9
%122 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %122) #9
%123 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %123) #9
ret void
}
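; int Eigen::divup<int>(int, int): signed ceiling division, (x + y - 1) / y.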
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN5Eigen5divupIiEET_S1_S1_(i32, i32) #2 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
store i32 %0, i32* %3, align 4
store i32 %1, i32* %4, align 4
%5 = load i32, i32* %3, align 4
%6 = load i32, i32* %4, align 4
%7 = add nsw i32 %5, %6
%8 = sub nsw i32 %7, 1
%9 = load i32, i32* %4, align 4
%10 = sdiv i32 %8, %9
ret i32 %10
}
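; Eigen::internal::EigenMetaKernel<TensorEvaluator<const TensorEvalToOp<const
;   TensorReductionOp<...>>, GpuDevice>, int>(evaluator, size): generic pointwise
; kernel. It copies the 128-byte evaluator by value, computes this thread's linear
; id and the grid stride, and hands both to EigenMetaKernelEval::run below, which
; evaluates one output scalar per index.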
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.5"* byval align 8, i32) #0 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca %"struct.Eigen::TensorEvaluator.5", align 8
%7 = alloca i8, align 1
%8 = alloca %"struct.Eigen::TensorEvaluator.5", align 8
store i32 %1, i32* %3, align 4
%9 = bitcast i32* %4 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%12 = mul i32 %10, %11
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%14 = add i32 %12, %13
store i32 %14, i32* %4, align 4
%15 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %15) #9
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%18 = mul i32 %16, %17
store i32 %18, i32* %5, align 4
%19 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8*
call void @llvm.lifetime.start(i64 128, i8* %19) #9
%20 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8*
%21 = bitcast %"struct.Eigen::TensorEvaluator.5"* %0 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 128, i32 8, i1 false)
call void @llvm.lifetime.start(i64 1, i8* %7) #9
store i8 0, i8* %7, align 1
%22 = bitcast %"struct.Eigen::TensorEvaluator.5"* %8 to i8*
%23 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 128, i32 8, i1 false)
%24 = load i32, i32* %4, align 4
%25 = load i32, i32* %3, align 4
%26 = load i32, i32* %5, align 4
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.5"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.5"* %8) #5
call void @llvm.lifetime.end(i64 1, i8* %7) #9
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.5"* %6) #5
%27 = bitcast %"struct.Eigen::TensorEvaluator.5"* %6 to i8*
call void @llvm.lifetime.end(i64 128, i8* %27) #9
%28 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %28) #9
%29 = bitcast i32* %4 to i8*
call void @llvm.lifetime.end(i64 4, i8* %29) #9
ret void
}
; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
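; EigenMetaKernelEval<..., /*Vectorizable=*/false>::run(eval, firstIdx, lastIdx, step):
; a plain grid-stride loop, for (i = firstIdx; i < lastIdx; i += step) eval.evalScalar(i).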
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.5"* byval align 8, i32, i32, i32) #2 comdat align 2 {
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%9 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = load i32, i32* %5, align 4
store i32 %10, i32* %8, align 4
br label %11
; <label>:11: ; preds = %19, %4
%12 = load i32, i32* %8, align 4
%13 = load i32, i32* %6, align 4
%14 = icmp slt i32 %12, %13
br i1 %14, label %17, label %15
; <label>:15: ; preds = %11
%16 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
br label %23
; <label>:17: ; preds = %11
%18 = load i32, i32* %8, align 4
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.5"* %0, i32 %18) #10
br label %19
; <label>:19: ; preds = %17
%20 = load i32, i32* %7, align 4
%21 = load i32, i32* %8, align 4
%22 = add nsw i32 %21, %20
store i32 %22, i32* %8, align 4
br label %11
; <label>:23: ; preds = %15
ret void
}
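; Complete-object destructor (D1) of the EvalTo evaluator; simply forwards to the
; base-object destructor (D2) defined further down.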
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.5"*) unnamed_addr #4 comdat align 2 {
%2 = alloca %"struct.Eigen::TensorEvaluator.5"*, align 8
store %"struct.Eigen::TensorEvaluator.5"* %0, %"struct.Eigen::TensorEvaluator.5"** %2, align 8
%3 = load %"struct.Eigen::TensorEvaluator.5"*, %"struct.Eigen::TensorEvaluator.5"** %2, align 8
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.5"* %3) #5
ret void
}
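; TensorEvaluator<const TensorEvalToOp<...>>::evalScalar(i): evaluates the wrapped
; reduction expression at index i and stores the float into the destination buffer
; (the third member of the evaluator struct).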
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.5"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.5"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.5"* %0, %"struct.Eigen::TensorEvaluator.5"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.5"*, %"struct.Eigen::TensorEvaluator.5"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %5, i32 0, i32 0
%7 = load i32, i32* %4, align 4
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator"* %6, i32 %7) #10
%9 = load i32, i32* %4, align 4
%10 = sext i32 %9 to i64
%11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %5, i32 0, i32 2
%12 = load float*, float** %11, align 8
%13 = getelementptr inbounds float, float* %12, i64 %10
store float %8, float* %13, align 4
ret void
}
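; TensorEvaluator<const TensorReductionOp<...>>::coeff(index): full per-output
; reduction. It initializes an accumulator via SumReducer::initialize, maps the
; output index to the first contributing input index with firstInput, folds all
; reduced coefficients through GenericDimReducer<0>::reduce, and returns
; finalize(accum).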
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator"*, align 8
%4 = alloca i32, align 4
%5 = alloca %"struct.Eigen::internal::SumReducer", align 1
%6 = alloca float, align 4
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %3, align 8
store i32 %1, i32* %4, align 4
%7 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %3, align 8
%8 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8*
call void @llvm.lifetime.start(i64 1, i8* %8) #9
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %7, i32 0, i32 11
%10 = bitcast float* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %10) #9
%11 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %5) #10
store float %11, float* %6, align 4
%12 = load i32, i32* %4, align 4
%13 = call i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator"* %7, i32 %12) #10
call void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112) %7, i32 %13, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %5, float* %6) #10
%14 = load float, float* %6, align 4
%15 = call float @_ZNK5Eigen8internal10SumReducerIfE8finalizeEf(%"struct.Eigen::internal::SumReducer"* %5, float %14) #10
%16 = bitcast float* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
%17 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8*
call void @llvm.lifetime.end(i64 1, i8* %17) #9
ret float %15
}
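; GenericDimReducer<0, ...>::reduce(self, firstIndex, reducer, accum): loops j over
; the extent of the reduced dimension (read from one member array of the evaluator)
; and recurses into the Dim = -1 base case at firstIndex + j * stride, with the
; stride read from a second member array.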
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 {
%5 = alloca %"struct.Eigen::TensorEvaluator"*, align 8
%6 = alloca i32, align 4
%7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
%8 = alloca float*, align 8
%9 = alloca i32, align 4
%10 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %5, align 8
store i32 %1, i32* %6, align 4
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8
store float* %3, float** %8, align 8
%11 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %11) #9
store i32 0, i32* %9, align 4
br label %12
; <label>:12: ; preds = %36, %4
%13 = load i32, i32* %9, align 4
%14 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8
%15 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %14, i32 0, i32 9
%16 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %15, i64 0) #10
%17 = load i32, i32* %16, align 4
%18 = icmp slt i32 %13, %17
br i1 %18, label %21, label %19
; <label>:19: ; preds = %12
%20 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %20) #9
br label %39
; <label>:21: ; preds = %12
%22 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %22) #9
%23 = load i32, i32* %6, align 4
%24 = load i32, i32* %9, align 4
%25 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8
%26 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %25, i32 0, i32 8
%27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %26, i64 0) #10
%28 = load i32, i32* %27, align 4
%29 = mul nsw i32 %24, %28
%30 = add nsw i32 %23, %29
store i32 %30, i32* %10, align 4
%31 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8
%32 = load i32, i32* %10, align 4
%33 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8
%34 = load float*, float** %8, align 8
call void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112) %31, i32 %32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %33, float* %34) #10
%35 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %35) #9
br label %36
; <label>:36: ; preds = %21
%37 = load i32, i32* %9, align 4
%38 = add nsw i32 %37, 1
store i32 %38, i32* %9, align 4
br label %12
; <label>:39: ; preds = %19
ret void
}
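; firstInput(index): maps an output coefficient index to the index of the first
; input coefficient that folds into it. The divisor loop here compares its counter
; against 0 and is therefore never entered (only one output dimension survives),
; so the result reduces to index times the single stride member at field 7.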
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator"*, align 8
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %3, align 8
store i32 %1, i32* %4, align 4
%8 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %3, align 8
%9 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
store i32 0, i32* %5, align 4
%10 = bitcast i32* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %10) #9
store i32 0, i32* %6, align 4
br label %11
; <label>:11: ; preds = %42, %2
%12 = load i32, i32* %6, align 4
%13 = icmp slt i32 %12, 0
br i1 %13, label %16, label %14
; <label>:14: ; preds = %11
%15 = bitcast i32* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %15) #9
br label %45
; <label>:16: ; preds = %11
%17 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %17) #9
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 4
%19 = load i32, i32* %6, align 4
%20 = sext i32 %19 to i64
%21 = call dereferenceable(12) %"struct.Eigen::internal::TensorIntDivisor"* @_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm(%"class.Eigen::array.2"* %18, i64 %20) #10
%22 = call i32 @_ZN5Eigen8internaldvIiLb0EEET_RKS2_RKNS0_16TensorIntDivisorIS2_XT0_EEE(i32* dereferenceable(4) %4, %"struct.Eigen::internal::TensorIntDivisor"* dereferenceable(12) %21) #10
store i32 %22, i32* %7, align 4
%23 = load i32, i32* %7, align 4
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 5
%25 = load i32, i32* %6, align 4
%26 = sext i32 %25 to i64
%27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %24, i64 %26) #10
%28 = load i32, i32* %27, align 4
%29 = mul nsw i32 %23, %28
%30 = load i32, i32* %5, align 4
%31 = add nsw i32 %30, %29
store i32 %31, i32* %5, align 4
%32 = load i32, i32* %7, align 4
%33 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 3
%34 = load i32, i32* %6, align 4
%35 = sext i32 %34 to i64
%36 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %33, i64 %35) #10
%37 = load i32, i32* %36, align 4
%38 = mul nsw i32 %32, %37
%39 = load i32, i32* %4, align 4
%40 = sub nsw i32 %39, %38
store i32 %40, i32* %4, align 4
%41 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %41) #9
br label %42
; <label>:42: ; preds = %16
%43 = load i32, i32* %6, align 4
%44 = add nsw i32 %43, 1
store i32 %44, i32* %6, align 4
br label %11
; <label>:45: ; preds = %14
%46 = load i32, i32* %4, align 4
%47 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %8, i32 0, i32 7
%48 = load i32, i32* %47, align 8
%49 = mul nsw i32 %46, %48
%50 = load i32, i32* %5, align 4
%51 = add nsw i32 %50, %49
store i32 %51, i32* %5, align 4
%52 = load i32, i32* %5, align 4
%53 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %53) #9
ret i32 %52
}
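; SumReducer<float>::finalize(accum): a sum needs no post-processing, so the
; accumulator is returned unchanged.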
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen8internal10SumReducerIfE8finalizeEf(%"struct.Eigen::internal::SumReducer"*, float) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
%4 = alloca float, align 4
store %"struct.Eigen::internal::SumReducer"* %0, %"struct.Eigen::internal::SumReducer"** %3, align 8
store float %1, float* %4, align 4
%5 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %3, align 8
%6 = load float, float* %4, align 4
ret float %6
}
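; Eigen::array<int, 1>::operator[](size_t i) const: returns the address of element i.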
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"*, i64) #4 comdat align 2 {
%3 = alloca %"class.Eigen::array.0"*, align 8
%4 = alloca i64, align 8
store %"class.Eigen::array.0"* %0, %"class.Eigen::array.0"** %3, align 8
store i64 %1, i64* %4, align 8
%5 = load %"class.Eigen::array.0"*, %"class.Eigen::array.0"** %3, align 8
%6 = load i64, i64* %4, align 8
%7 = getelementptr inbounds %"class.Eigen::array.0", %"class.Eigen::array.0"* %5, i32 0, i32 0
%8 = getelementptr inbounds [1 x i32], [1 x i32]* %7, i64 0, i64 %6
ret i32* %8
}
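; GenericDimReducer<-1, ...>::reduce: recursion base case. Reads one input
; coefficient through the nested TensorMap evaluator (field 10) and folds it into
; the accumulator with SumReducer::reduce.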
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 {
%5 = alloca %"struct.Eigen::TensorEvaluator"*, align 8
%6 = alloca i32, align 4
%7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
%8 = alloca float*, align 8
store %"struct.Eigen::TensorEvaluator"* %0, %"struct.Eigen::TensorEvaluator"** %5, align 8
store i32 %1, i32* %6, align 4
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8
store float* %3, float** %8, align 8
%9 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8
%10 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %5, align 8
%11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %10, i32 0, i32 10
%12 = load i32, i32* %6, align 4
%13 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %11, i32 %12) #10
%14 = load float*, float** %8, align 8
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %9, float %13, float* %14) #10
ret void
}
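; Eigen::internal::operator/(const int&, const TensorIntDivisor<int>&): forwards to
; TensorIntDivisor::divide, i.e. fast division by a loop-invariant integer.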
; Function Attrs: convergent inlinehint nounwind
define internal i32 @_ZN5Eigen8internaldvIiLb0EEET_RKS2_RKNS0_16TensorIntDivisorIS2_XT0_EEE(i32* dereferenceable(4), %"struct.Eigen::internal::TensorIntDivisor"* dereferenceable(12)) #4 {
%3 = alloca i32*, align 8
%4 = alloca %"struct.Eigen::internal::TensorIntDivisor"*, align 8
store i32* %0, i32** %3, align 8
store %"struct.Eigen::internal::TensorIntDivisor"* %1, %"struct.Eigen::internal::TensorIntDivisor"** %4, align 8
%5 = load %"struct.Eigen::internal::TensorIntDivisor"*, %"struct.Eigen::internal::TensorIntDivisor"** %4, align 8
%6 = load i32*, i32** %3, align 8
%7 = load i32, i32* %6, align 4
%8 = call i32 @_ZNK5Eigen8internal16TensorIntDivisorIiLb0EE6divideEi(%"struct.Eigen::internal::TensorIntDivisor"* %5, i32 %7) #10
ret i32 %8
}
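; Eigen::array<TensorIntDivisor<int>, 1>::operator[](size_t i) const: element address.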
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr dereferenceable(12) %"struct.Eigen::internal::TensorIntDivisor"* @_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm(%"class.Eigen::array.2"*, i64) #4 comdat align 2 {
%3 = alloca %"class.Eigen::array.2"*, align 8
%4 = alloca i64, align 8
store %"class.Eigen::array.2"* %0, %"class.Eigen::array.2"** %3, align 8
store i64 %1, i64* %4, align 8
%5 = load %"class.Eigen::array.2"*, %"class.Eigen::array.2"** %3, align 8
%6 = load i64, i64* %4, align 8
%7 = getelementptr inbounds %"class.Eigen::array.2", %"class.Eigen::array.2"* %5, i32 0, i32 0
%8 = getelementptr inbounds [1 x %"struct.Eigen::internal::TensorIntDivisor"], [1 x %"struct.Eigen::internal::TensorIntDivisor"]* %7, i64 0, i64 %6
ret %"struct.Eigen::internal::TensorIntDivisor"* %8
}
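; TensorIntDivisor<int>::divide(n): division by a fixed divisor without a hardware
; divide. With the precomputed fields (multiplier, shift1, shift2):
;   t = umulhi(multiplier, n);  return (t + ((n - t) >> shift1)) >> shift2;
; the classic magic-number scheme for unsigned division by an invariant integer.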
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr i32 @_ZNK5Eigen8internal16TensorIntDivisorIiLb0EE6divideEi(%"struct.Eigen::internal::TensorIntDivisor"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::internal::TensorIntDivisor"*, align 8
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
store %"struct.Eigen::internal::TensorIntDivisor"* %0, %"struct.Eigen::internal::TensorIntDivisor"** %3, align 8
store i32 %1, i32* %4, align 4
%7 = load %"struct.Eigen::internal::TensorIntDivisor"*, %"struct.Eigen::internal::TensorIntDivisor"** %3, align 8
%8 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %8) #9
%9 = getelementptr inbounds %"struct.Eigen::internal::TensorIntDivisor", %"struct.Eigen::internal::TensorIntDivisor"* %7, i32 0, i32 0
%10 = load i32, i32* %9, align 4
%11 = load i32, i32* %4, align 4
%12 = call i32 @_ZN5Eigen8internal12_GLOBAL__N_15muluhIiEEjjT_(i32 %10, i32 %11) #10
store i32 %12, i32* %5, align 4
%13 = bitcast i32* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %13) #9
%14 = load i32, i32* %4, align 4
%15 = load i32, i32* %5, align 4
%16 = sub i32 %14, %15
%17 = getelementptr inbounds %"struct.Eigen::internal::TensorIntDivisor", %"struct.Eigen::internal::TensorIntDivisor"* %7, i32 0, i32 1
%18 = load i32, i32* %17, align 4
%19 = lshr i32 %16, %18
store i32 %19, i32* %6, align 4
%20 = load i32, i32* %5, align 4
%21 = load i32, i32* %6, align 4
%22 = add i32 %20, %21
%23 = getelementptr inbounds %"struct.Eigen::internal::TensorIntDivisor", %"struct.Eigen::internal::TensorIntDivisor"* %7, i32 0, i32 2
%24 = load i32, i32* %23, align 4
%25 = lshr i32 %22, %24
%26 = bitcast i32* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %26) #9
%27 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %27) #9
ret i32 %25
}
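; muluh<int>(a, b): upper 32 bits of the 32x32 -> 64-bit unsigned product, via __umulhi.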
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal i32 @_ZN5Eigen8internal12_GLOBAL__N_15muluhIiEEjjT_(i32, i32) #2 {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
store i32 %0, i32* %3, align 4
store i32 %1, i32* %4, align 4
%5 = load i32, i32* %3, align 4
%6 = load i32, i32* %4, align 4
%7 = call i32 @_ZL8__umulhijj(i32 %5, i32 %6) #10
ret i32 %7
}
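; __umulhi(a, b): thin wrapper over the libdevice intrinsic __nv_umulhi.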
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal i32 @_ZL8__umulhijj(i32, i32) #2 {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
store i32 %0, i32* %3, align 4
store i32 %1, i32* %4, align 4
%5 = load i32, i32* %3, align 4
%6 = load i32, i32* %4, align 4
%7 = call i32 @__nv_umulhi(i32 %5, i32 %6) #10
ret i32 %7
}
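; Base-object destructor (D2) of the EvalTo evaluator: the evaluator owns no device
; resources here, so the body is empty.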
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.5"*) unnamed_addr #4 comdat align 2 {
%2 = alloca %"struct.Eigen::TensorEvaluator.5"*, align 8
store %"struct.Eigen::TensorEvaluator.5"* %0, %"struct.Eigen::TensorEvaluator.5"** %2, align 8
%3 = load %"struct.Eigen::TensorEvaluator.5"*, %"struct.Eigen::TensorEvaluator.5"** %2, align 8
ret void
}
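; Eigen::internal::(anonymous namespace)::InitVector<PtrWrapper<float, int>>(value,
; size, output): grid-stride loop that stores the init value into output.coeffRef(i)
; for every i < size.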
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_(float, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%4 = alloca float, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
store float %0, float* %4, align 4
store i32 %1, i32* %5, align 4
%8 = bitcast i32* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %8) #9
%9 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%10 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%11 = mul i32 %9, %10
%12 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%13 = add i32 %11, %12
store i32 %13, i32* %6, align 4
%14 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %14) #9
%15 = load i32, i32* %6, align 4
store i32 %15, i32* %7, align 4
br label %16
; <label>:16: ; preds = %26, %3
%17 = load i32, i32* %7, align 4
%18 = load i32, i32* %5, align 4
%19 = icmp slt i32 %17, %18
br i1 %19, label %22, label %20
; <label>:20: ; preds = %16
%21 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %21) #9
br label %32
; <label>:22: ; preds = %16
%23 = load float, float* %4, align 4
%24 = load i32, i32* %7, align 4
%25 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %2, i32 %24) #10
store float %23, float* %25, align 4
br label %26
; <label>:26: ; preds = %22
%27 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = mul i32 %27, %28
%30 = load i32, i32* %7, align 4
%31 = add i32 %30, %29
store i32 %31, i32* %7, align 4
br label %16
; <label>:32: ; preds = %20
%33 = bitcast i32* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %33) #9
ret void
}
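; PtrWrapper<float, int>::coeffRef(i): returns &ptr[i] (index sign-extended to i64).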
; Function Attrs: convergent nounwind
define linkonce_odr dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"*, i32) #0 comdat align 2 {
%3 = alloca %"struct.Eigen::internal::PtrWrapper"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::internal::PtrWrapper"* %0, %"struct.Eigen::internal::PtrWrapper"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::internal::PtrWrapper"*, %"struct.Eigen::internal::PtrWrapper"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %5, i32 0, i32 0
%7 = load float*, float** %6, align 8
%8 = load i32, i32* %4, align 4
%9 = sext i32 %8 to i64
%10 = getelementptr inbounds float, float* %7, i64 %9
ret float* %10
}
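; Eigen::internal::(anonymous namespace)::RowReduceKernel<32, 256, 128,
;   TensorEvaluator<const TensorMap<Tensor<float,2,1,int>,0>, GpuDevice>,
;   PtrWrapper<float,int>, CudaSumReducer>(reducer, input, num_rows, num_cols, output).
; The launch shape is asserted to be blockDim = (256,1,1) and gridDim = (32,1,1).
; Each row is split into column blocks of 256 * 128 = 32768 elements; blocks walk
; the (row, column-block) items with stride 32. A thread accumulates its strided
; columns in chunks of 16 (switching to a bounds-checked tail once the 16th index
; would pass num_cols), the warp folds partials with __shfl_down at offsets 16..1,
; and lane 0 atomically reduces the warp sum into output[row].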
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; <label>:74: ; preds = %237, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %240
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10
store float %99, float* %19, align 4
%100 = load i32, i32* %18, align 4
%101 = load i32, i32* %6, align 4
%102 = icmp slt i32 %100, %101
br i1 %102, label %103, label %198
; <label>:103: ; preds = %80
%104 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %104) #9
store i32 0, i32* %20, align 4
br label %105
; <label>:105: ; preds = %192, %103
%106 = load i32, i32* %20, align 4
%107 = icmp slt i32 %106, 128
br i1 %107, label %109, label %108
; <label>:108: ; preds = %105
store i32 5, i32* %14, align 4
br label %195
; <label>:109: ; preds = %105
%110 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %110) #9
%111 = load i32, i32* %17, align 4
%112 = load i32, i32* %20, align 4
%113 = add nsw i32 %112, 16
%114 = sub nsw i32 %113, 1
%115 = mul nsw i32 256, %114
%116 = add nsw i32 %111, %115
store i32 %116, i32* %21, align 4
%117 = load i32, i32* %21, align 4
%118 = load i32, i32* %7, align 4
%119 = icmp sge i32 %117, %118
br i1 %119, label %120, label %158
; <label>:120: ; preds = %109
%121 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %121) #9
store i32 0, i32* %22, align 4
br label %122
; <label>:122: ; preds = %152, %120
%123 = load i32, i32* %22, align 4
%124 = icmp slt i32 %123, 15
br i1 %124, label %126, label %125
; <label>:125: ; preds = %122
store i32 8, i32* %14, align 4
br label %155
; <label>:126: ; preds = %122
%127 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %127) #9
%128 = load i32, i32* %17, align 4
%129 = load i32, i32* %20, align 4
%130 = load i32, i32* %22, align 4
%131 = add nsw i32 %129, %130
%132 = mul nsw i32 256, %131
%133 = add nsw i32 %128, %132
store i32 %133, i32* %23, align 4
%134 = load i32, i32* %23, align 4
%135 = load i32, i32* %7, align 4
%136 = icmp sge i32 %134, %135
br i1 %136, label %137, label %138
; <label>:137: ; preds = %126
store i32 8, i32* %14, align 4
br label %148
; <label>:138: ; preds = %126
%139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%140 = load float, float* %19, align 4
%141 = load i32, i32* %18, align 4
%142 = load i32, i32* %7, align 4
%143 = mul nsw i32 %141, %142
%144 = load i32, i32* %23, align 4
%145 = add nsw i32 %143, %144
%146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %145) #10
%147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10
store float %147, float* %19, align 4
store i32 0, i32* %14, align 4
br label %148
; <label>:148: ; preds = %138, %137
%149 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %149) #9
%150 = load i32, i32* %14, align 4
switch i32 %150, label %155 [
i32 0, label %151
]
; <label>:151: ; preds = %148
br label %152
; <label>:152: ; preds = %151
%153 = load i32, i32* %22, align 4
%154 = add nsw i32 %153, 1
store i32 %154, i32* %22, align 4
br label %122, !llvm.loop !52
; <label>:155: ; preds = %148, %125
%156 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %156) #9
br label %157
; <label>:157: ; preds = %155
store i32 5, i32* %14, align 4
br label %188
; <label>:158: ; preds = %109
%159 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %159) #9
store i32 0, i32* %24, align 4
br label %160
; <label>:160: ; preds = %183, %158
%161 = load i32, i32* %24, align 4
%162 = icmp slt i32 %161, 16
br i1 %162, label %165, label %163
; <label>:163: ; preds = %160
store i32 11, i32* %14, align 4
%164 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %164) #9
br label %186
; <label>:165: ; preds = %160
%166 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %166) #9
%167 = load i32, i32* %17, align 4
%168 = load i32, i32* %20, align 4
%169 = load i32, i32* %24, align 4
%170 = add nsw i32 %168, %169
%171 = mul nsw i32 256, %170
%172 = add nsw i32 %167, %171
store i32 %172, i32* %25, align 4
%173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%174 = load float, float* %19, align 4
%175 = load i32, i32* %18, align 4
%176 = load i32, i32* %7, align 4
%177 = mul nsw i32 %175, %176
%178 = load i32, i32* %25, align 4
%179 = add nsw i32 %177, %178
%180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %179) #10
%181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10
store float %181, float* %19, align 4
%182 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %182) #9
br label %183
; <label>:183: ; preds = %165
%184 = load i32, i32* %24, align 4
%185 = add nsw i32 %184, 1
store i32 %185, i32* %24, align 4
br label %160, !llvm.loop !53
; <label>:186: ; preds = %163
br label %187
; <label>:187: ; preds = %186
store i32 0, i32* %14, align 4
br label %188
; <label>:188: ; preds = %187, %157
%189 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %189) #9
%190 = load i32, i32* %14, align 4
switch i32 %190, label %195 [
i32 0, label %191
]
; <label>:191: ; preds = %188
br label %192
; <label>:192: ; preds = %191
%193 = load i32, i32* %20, align 4
%194 = add nsw i32 %193, 16
store i32 %194, i32* %20, align 4
br label %105, !llvm.loop !54
; <label>:195: ; preds = %188, %108
%196 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %196) #9
br label %197
; <label>:197: ; preds = %195
br label %198
; <label>:198: ; preds = %197, %80
%199 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %199) #9
store i32 16, i32* %26, align 4
br label %200
; <label>:200: ; preds = %212, %198
%201 = load i32, i32* %26, align 4
%202 = icmp sgt i32 %201, 0
br i1 %202, label %205, label %203
; <label>:203: ; preds = %200
store i32 14, i32* %14, align 4
%204 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %204) #9
br label %215
; <label>:205: ; preds = %200
%206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%207 = load float, float* %19, align 4
%208 = load i32, i32* %26, align 4
%209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10
%210 = load float, float* %19, align 4
%211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10
store float %211, float* %19, align 4
br label %212
; <label>:212: ; preds = %205
%213 = load i32, i32* %26, align 4
%214 = sdiv i32 %213, 2
store i32 %214, i32* %26, align 4
br label %200, !llvm.loop !56
; <label>:215: ; preds = %203
%216 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %216) #9
%217 = load i32, i32* %12, align 4
%218 = and i32 %217, 31
store i32 %218, i32* %27, align 4
%219 = load i32, i32* %27, align 4
%220 = icmp eq i32 %219, 0
br i1 %220, label %221, label %230
; <label>:221: ; preds = %215
%222 = load i32, i32* %18, align 4
%223 = load i32, i32* %6, align 4
%224 = icmp slt i32 %222, %223
br i1 %224, label %225, label %230
; <label>:225: ; preds = %221
%226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%227 = load i32, i32* %18, align 4
%228 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %227) #10
%229 = load float, float* %19, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10
br label %230
; <label>:230: ; preds = %225, %221, %215
%231 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
%232 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %232) #9
%233 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %233) #9
%234 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %234) #9
%235 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %235) #9
%236 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
br label %237
; <label>:237: ; preds = %230
%238 = load i32, i32* %13, align 4
%239 = add nsw i32 %238, 32
store i32 %239, i32* %13, align 4
br label %74
; <label>:240: ; preds = %78
%241 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %241) #9
%242 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %242) #9
%243 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %243) #9
%244 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %244) #9
%245 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %245) #9
ret void
}
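; Device-side assert helper: the mangled name demangles to
; __assert_fail(const char*, const char*, unsigned, const char*); it simply
; forwards its arguments to the CUDA runtime entry __assertfail.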
; Function Attrs: convergent inlinehint nounwind
define internal void @_ZL13__assert_failPKcS0_jS0_(i8*, i8*, i32, i8*) #4 {
%5 = alloca i8*, align 8
%6 = alloca i8*, align 8
%7 = alloca i32, align 4
%8 = alloca i8*, align 8
store i8* %0, i8** %5, align 8
store i8* %1, i8** %6, align 8
store i32 %2, i32* %7, align 4
store i8* %3, i8** %8, align 8
%9 = load i8*, i8** %5, align 8
%10 = load i8*, i8** %6, align 8
%11 = load i32, i32* %7, align 4
%12 = load i8*, i8** %8, align 8
call void @__assertfail(i8* %9, i8* %10, i32 %11, i8* %12, i64 1) #11
unreachable
; No predecessors! (dead continuation block: __assertfail is noreturn, so the
; ret below can never execute; clang keeps it in this unoptimized dump)
ret void
}
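; The next four accessors wrap the CUDA builtins blockDim.y, blockDim.z,
; gridDim.y and gridDim.z, each lowering to the corresponding
; llvm.ptx.read.* intrinsic (ntid = threads per block, nctaid = blocks
; per grid).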
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.ntid.y()
ret i32 %1
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.ntid.z()
ret i32 %1
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.nctaid.y()
ret i32 %1
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #2 comdat align 2 {
%1 = call i32 @llvm.ptx.read.nctaid.z()
ret i32 %1
}
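; FnSumReducer<Identity>::bottom_value(): the identity element of the sum
; reduction, always 0.0f.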
; Function Attrs: convergent nounwind
define internal float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*) #0 align 2 {
%2 = alloca %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, align 8
store %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %2, align 8
%3 = load %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %2, align 8
ret float 0.000000e+00
}
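; FnSumReducer<Identity>::operator()(float, float) const: accumulates by
; returning x + Identity()(y), i.e. a plain float add.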
; Function Attrs: convergent nounwind
define internal float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, float, float) #0 align 2 {
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, align 8
%5 = alloca float, align 4
%6 = alloca float, align 4
store %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8
store float %1, float* %5, align 4
store float %2, float* %6, align 4
%7 = load %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8
%8 = load float, float* %5, align 4
%9 = getelementptr inbounds %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer", %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %7, i32 0, i32 0
%10 = load float, float* %6, align 4
%11 = call float @_ZNK5Eigen8internal12_GLOBAL__N_18IdentityclEf(%"struct.Eigen::internal::(anonymous namespace)::Identity"* %9, float %10) #10
%12 = fadd float %8, %11
ret float %12
}
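; FnSumReducer<Identity>::atomic_reduce(float*, float) const: commits a
; partial sum to global memory through float atomicAdd.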
; Function Attrs: convergent nounwind
define internal void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, float*, float) #0 align 2 {
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, align 8
%5 = alloca float*, align 8
%6 = alloca float, align 4
store %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8
store float* %1, float** %5, align 8
store float %2, float* %6, align 4
%7 = load %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*, %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"** %4, align 8
%8 = load float*, float** %5, align 8
%9 = load float, float* %6, align 4
%10 = call float @_ZL9atomicAddPff(float* %8, float %9) #10
ret void
}
; Function Attrs: convergent noreturn
declare void @__assertfail(i8*, i8*, i32, i8*, i64) #7
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.z() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.z() #3
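; Identity::operator()(float) const: returns its argument unchanged. It is
; the pre-accumulation transform that FnSumReducer is parameterized on.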
; Function Attrs: convergent nounwind
define internal float @_ZNK5Eigen8internal12_GLOBAL__N_18IdentityclEf(%"struct.Eigen::internal::(anonymous namespace)::Identity"*, float) #0 align 2 {
%3 = alloca %"struct.Eigen::internal::(anonymous namespace)::Identity"*, align 8
%4 = alloca float, align 4
store %"struct.Eigen::internal::(anonymous namespace)::Identity"* %0, %"struct.Eigen::internal::(anonymous namespace)::Identity"** %3, align 8
store float %1, float* %4, align 4
%5 = load %"struct.Eigen::internal::(anonymous namespace)::Identity"*, %"struct.Eigen::internal::(anonymous namespace)::Identity"** %3, align 8
%6 = load float, float* %4, align 4
ret float %6
}
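; RowReduceKernel<32, 256, 128, ..., PtrWrapper<float,int>, CudaMaxReducer>
; (reducer %0, input %1, i32 %2, i32 %3, output %4). Reading the unoptimized
; control flow, %2 acts as the row count and %3 as the column count:
;   * launch shape is asserted: blockDim == (256,1,1), gridDim == (32,1,1);
;   * a grid-stride loop (step 32) walks divup(%3, 256*128) * %2 work items,
;     each a (column block, row) pair;
;   * per item, the thread folds up to 128 columns in chunks of 16, using an
;     unchecked 16-iteration path when the whole chunk is in range and a
;     bounds-checked remainder loop otherwise;
;   * a warp shuffle loop then combines lane partials, and lane 0 commits the
;     warp result with the CAS-based atomic max into output.coeffRef(row).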
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; <label>:74: ; preds = %232, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %235
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %98, float* %19, align 4
%99 = load i32, i32* %18, align 4
%100 = load i32, i32* %6, align 4
%101 = icmp slt i32 %99, %100
br i1 %101, label %102, label %195
; <label>:102: ; preds = %80
%103 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %103) #9
store i32 0, i32* %20, align 4
br label %104
; <label>:104: ; preds = %189, %102
%105 = load i32, i32* %20, align 4
%106 = icmp slt i32 %105, 128
br i1 %106, label %108, label %107
; <label>:107: ; preds = %104
store i32 5, i32* %14, align 4
br label %192
; <label>:108: ; preds = %104
%109 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %109) #9
%110 = load i32, i32* %17, align 4
%111 = load i32, i32* %20, align 4
%112 = add nsw i32 %111, 16
%113 = sub nsw i32 %112, 1
%114 = mul nsw i32 256, %113
%115 = add nsw i32 %110, %114
store i32 %115, i32* %21, align 4
%116 = load i32, i32* %21, align 4
%117 = load i32, i32* %7, align 4
%118 = icmp sge i32 %116, %117
br i1 %118, label %119, label %156
; <label>:119: ; preds = %108
%120 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %120) #9
store i32 0, i32* %22, align 4
br label %121
; <label>:121: ; preds = %150, %119
%122 = load i32, i32* %22, align 4
%123 = icmp slt i32 %122, 15
br i1 %123, label %125, label %124
; <label>:124: ; preds = %121
store i32 8, i32* %14, align 4
br label %153
; <label>:125: ; preds = %121
%126 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %126) #9
%127 = load i32, i32* %17, align 4
%128 = load i32, i32* %20, align 4
%129 = load i32, i32* %22, align 4
%130 = add nsw i32 %128, %129
%131 = mul nsw i32 256, %130
%132 = add nsw i32 %127, %131
store i32 %132, i32* %23, align 4
%133 = load i32, i32* %23, align 4
%134 = load i32, i32* %7, align 4
%135 = icmp sge i32 %133, %134
br i1 %135, label %136, label %137
; <label>:136: ; preds = %125
store i32 8, i32* %14, align 4
br label %146
; <label>:137: ; preds = %125
%138 = load float, float* %19, align 4
%139 = load i32, i32* %18, align 4
%140 = load i32, i32* %7, align 4
%141 = mul nsw i32 %139, %140
%142 = load i32, i32* %23, align 4
%143 = add nsw i32 %141, %142
%144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %143) #10
%145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10
store float %145, float* %19, align 4
store i32 0, i32* %14, align 4
br label %146
; <label>:146: ; preds = %137, %136
%147 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %147) #9
%148 = load i32, i32* %14, align 4
switch i32 %148, label %153 [
i32 0, label %149
]
; <label>:149: ; preds = %146
br label %150
; <label>:150: ; preds = %149
%151 = load i32, i32* %22, align 4
%152 = add nsw i32 %151, 1
store i32 %152, i32* %22, align 4
br label %121, !llvm.loop !57
; <label>:153: ; preds = %146, %124
%154 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %154) #9
br label %155
; <label>:155: ; preds = %153
store i32 5, i32* %14, align 4
br label %185
; <label>:156: ; preds = %108
%157 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %157) #9
store i32 0, i32* %24, align 4
br label %158
; <label>:158: ; preds = %180, %156
%159 = load i32, i32* %24, align 4
%160 = icmp slt i32 %159, 16
br i1 %160, label %163, label %161
; <label>:161: ; preds = %158
store i32 11, i32* %14, align 4
%162 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %162) #9
br label %183
; <label>:163: ; preds = %158
%164 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %164) #9
%165 = load i32, i32* %17, align 4
%166 = load i32, i32* %20, align 4
%167 = load i32, i32* %24, align 4
%168 = add nsw i32 %166, %167
%169 = mul nsw i32 256, %168
%170 = add nsw i32 %165, %169
store i32 %170, i32* %25, align 4
%171 = load float, float* %19, align 4
%172 = load i32, i32* %18, align 4
%173 = load i32, i32* %7, align 4
%174 = mul nsw i32 %172, %173
%175 = load i32, i32* %25, align 4
%176 = add nsw i32 %174, %175
%177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %176) #10
%178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10
store float %178, float* %19, align 4
%179 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %179) #9
br label %180
; <label>:180: ; preds = %163
%181 = load i32, i32* %24, align 4
%182 = add nsw i32 %181, 1
store i32 %182, i32* %24, align 4
br label %158, !llvm.loop !58
; <label>:183: ; preds = %161
br label %184
; <label>:184: ; preds = %183
store i32 0, i32* %14, align 4
br label %185
; <label>:185: ; preds = %184, %155
%186 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %186) #9
%187 = load i32, i32* %14, align 4
switch i32 %187, label %192 [
i32 0, label %188
]
; <label>:188: ; preds = %185
br label %189
; <label>:189: ; preds = %188
%190 = load i32, i32* %20, align 4
%191 = add nsw i32 %190, 16
store i32 %191, i32* %20, align 4
br label %104, !llvm.loop !59
; <label>:192: ; preds = %185, %107
%193 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %193) #9
br label %194
; <label>:194: ; preds = %192
br label %195
; <label>:195: ; preds = %194, %80
%196 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %196) #9
store i32 16, i32* %26, align 4
br label %197
; <label>:197: ; preds = %208, %195
%198 = load i32, i32* %26, align 4
%199 = icmp sgt i32 %198, 0
br i1 %199, label %202, label %200
; <label>:200: ; preds = %197
store i32 14, i32* %14, align 4
%201 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %201) #9
br label %211
; <label>:202: ; preds = %197
%203 = load float, float* %19, align 4
%204 = load i32, i32* %26, align 4
%205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10
%206 = load float, float* %19, align 4
%207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10
store float %207, float* %19, align 4
br label %208
; <label>:208: ; preds = %202
%209 = load i32, i32* %26, align 4
%210 = sdiv i32 %209, 2
store i32 %210, i32* %26, align 4
br label %197, !llvm.loop !60
; <label>:211: ; preds = %200
%212 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %212) #9
%213 = load i32, i32* %12, align 4
%214 = and i32 %213, 31
store i32 %214, i32* %27, align 4
%215 = load i32, i32* %27, align 4
%216 = icmp eq i32 %215, 0
br i1 %216, label %217, label %225
; <label>:217: ; preds = %211
%218 = load i32, i32* %18, align 4
%219 = load i32, i32* %6, align 4
%220 = icmp slt i32 %218, %219
br i1 %220, label %221, label %225
; <label>:221: ; preds = %217
%222 = load i32, i32* %18, align 4
%223 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %222) #10
%224 = load float, float* %19, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10
br label %225
; <label>:225: ; preds = %221, %217, %211
%226 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %226) #9
%227 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %227) #9
%228 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %228) #9
%229 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %229) #9
%230 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %230) #9
%231 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
br label %232
; <label>:232: ; preds = %225
%233 = load i32, i32* %13, align 4
%234 = add nsw i32 %233, 32
store i32 %234, i32* %13, align 4
br label %74
; <label>:235: ; preds = %78
%236 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
%237 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %237) #9
%238 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %238) #9
%239 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %239) #9
%240 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %240) #9
ret void
}
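; CudaMaxReducer::bottom_value(): returns the float stored in the reducer,
; i.e. the initial value supplied for the max reduction.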
; Function Attrs: convergent nounwind
define internal float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*) #0 align 2 {
%2 = alloca %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, align 8
store %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %2, align 8
%3 = load %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %2, align 8
%4 = getelementptr inbounds %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %3, i32 0, i32 0
%5 = load float, float* %4, align 4
ret float %5
}
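; CudaMaxReducer::operator()(float, float) const: folds two partials with
; fmaxf.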
; Function Attrs: convergent nounwind
define internal float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, float, float) #0 align 2 {
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, align 8
%5 = alloca float, align 4
%6 = alloca float, align 4
store %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8
store float %1, float* %5, align 4
store float %2, float* %6, align 4
%7 = load %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8
%8 = load float, float* %5, align 4
%9 = load float, float* %6, align 4
%10 = call float @_ZL4fmaxff(float %8, float %9) #10
ret float %10
}
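; CudaMaxReducer::atomic_reduce(float*, float): a compare-and-swap loop that
; behaves like an atomic float max. The destination's bits are read as i32;
; while the observed value compares less than the candidate, atomicCAS tries
; to install the candidate's bit pattern, retrying with the newly observed
; value after each failed exchange.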
; Function Attrs: convergent nounwind
define internal void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, float*, float) #0 align 2 {
%4 = alloca %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, align 8
%5 = alloca float*, align 8
%6 = alloca float, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32
store %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8
store float* %1, float** %5, align 8
store float %2, float* %6, align 4
%10 = load %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"** %4, align 8
%11 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %11) #9
%12 = load float*, float** %5, align 8
%13 = bitcast float* %12 to i32*
%14 = load i32, i32* %13, align 4
store i32 %14, i32* %7, align 4
br label %15
; <label>:15: ; preds = %37, %3
%16 = bitcast i32* %7 to float*
%17 = load float, float* %16, align 4
%18 = load float, float* %6, align 4
%19 = fcmp olt float %17, %18
br i1 %19, label %20, label %38
; <label>:20: ; preds = %15
%21 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %21) #9
%22 = load float*, float** %5, align 8
%23 = bitcast float* %22 to i32*
%24 = load i32, i32* %7, align 4
%25 = bitcast float* %6 to i32*
%26 = load i32, i32* %25, align 4
%27 = call i32 @_ZL9atomicCASPjjj(i32* %23, i32 %24, i32 %26) #10
store i32 %27, i32* %8, align 4
%28 = load i32, i32* %7, align 4
%29 = load i32, i32* %8, align 4
%30 = icmp eq i32 %28, %29
br i1 %30, label %31, label %32
; <label>:31: ; preds = %20
store i32 3, i32* %9, align 4
br label %34
; <label>:32: ; preds = %20
%33 = load i32, i32* %8, align 4
store i32 %33, i32* %7, align 4
store i32 0, i32* %9, align 4
br label %34
; <label>:34: ; preds = %32, %31
%35 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %35) #9
%36 = load i32, i32* %9, align 4
switch i32 %36, label %40 [
i32 0, label %37
i32 3, label %38
]
; <label>:37: ; preds = %34
br label %15
; <label>:38: ; preds = %34, %15
%39 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %39) #9
ret void
; <label>:40: ; preds = %34
unreachable
}
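; fmax(float, float) -> fmaxf(float, float) -> __nv_fmaxf: the CUDA math
; header wrapper chain down to the libdevice function.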
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @_ZL4fmaxff(float, float) #2 {
%3 = alloca float, align 4
%4 = alloca float, align 4
store float %0, float* %3, align 4
store float %1, float* %4, align 4
%5 = load float, float* %3, align 4
%6 = load float, float* %4, align 4
%7 = call float @_ZL5fmaxfff(float %5, float %6) #10
ret float %7
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @_ZL5fmaxfff(float, float) #2 {
%3 = alloca float, align 4
%4 = alloca float, align 4
store float %0, float* %3, align 4
store float %1, float* %4, align 4
%5 = load float, float* %3, align 4
%6 = load float, float* %4, align 4
%7 = call float @__nv_fmaxf(float %5, float %6) #10
ret float %7
}
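; atomicCAS(unsigned int*, unsigned int, unsigned int): thin wrapper over
; __uAtomicCAS.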
; Function Attrs: convergent inlinehint nounwind
define internal i32 @_ZL9atomicCASPjjj(i32*, i32, i32) #4 {
%4 = alloca i32*, align 8
%5 = alloca i32, align 4
%6 = alloca i32, align 4
store i32* %0, i32** %4, align 8
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
%7 = load i32*, i32** %4, align 8
%8 = load i32, i32* %5, align 4
%9 = load i32, i32* %6, align 4
%10 = call i32 @_ZL12__uAtomicCASPjjj(i32* %7, i32 %8, i32 %9) #10
ret i32 %10
}
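; __uAtomicCAS lowers to a seq_cst cmpxchg; the extractvalue pulls the
; previously stored value out of the { i32, i1 } result pair.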
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal i32 @_ZL12__uAtomicCASPjjj(i32*, i32, i32) #2 {
%4 = alloca i32*, align 8
%5 = alloca i32, align 4
%6 = alloca i32, align 4
store i32* %0, i32** %4, align 8
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
%7 = load i32*, i32** %4, align 8
%8 = load i32, i32* %5, align 4
%9 = load i32, i32* %6, align 4
%10 = cmpxchg i32* %7, i32 %8, i32 %9 seq_cst seq_cst
%11 = extractvalue { i32, i1 } %10, 0
ret i32 %11
}
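; ColumnReduceKernel<128, 256, 16, ..., PtrWrapper<float,int>, CudaSumReducer>
; (reducer %0, input %1, i32 %2, i32 %3, output %4). Here %2 acts as the row
; count and %3 as the column count. After asserting blockDim == (256,1,1) and
; gridDim == (128,1,1), each thread walks a grid-stride loop (step 128*256 =
; 32768) over divup(%2, 16) * %3 work items; one item is a (16-row tile,
; column) pair. Out-of-range elements contribute bottom_value() (0.0f), and
; the per-thread partial sum is committed via atomic_reduce -> atomicAdd into
; output.coeffRef(col).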
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; <label>:64: ; preds = %135, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %138
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10
store float %87, float* %15, align 4
%88 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %88) #9
store i32 0, i32* %16, align 4
br label %89
; <label>:89: ; preds = %124, %70
%90 = load i32, i32* %16, align 4
%91 = icmp slt i32 %90, 16
br i1 %91, label %94, label %92
; <label>:92: ; preds = %89
store i32 5, i32* %12, align 4
%93 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %93) #9
br label %127
; <label>:94: ; preds = %89
%95 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %13, align 4
%97 = load i32, i32* %7, align 4
%98 = icmp slt i32 %96, %97
br i1 %98, label %99, label %114
; <label>:99: ; preds = %94
%100 = load i32, i32* %14, align 4
%101 = load i32, i32* %16, align 4
%102 = add nsw i32 %100, %101
%103 = load i32, i32* %6, align 4
%104 = icmp slt i32 %102, %103
br i1 %104, label %105, label %114
; <label>:105: ; preds = %99
%106 = load i32, i32* %14, align 4
%107 = load i32, i32* %16, align 4
%108 = add nsw i32 %106, %107
%109 = load i32, i32* %7, align 4
%110 = mul nsw i32 %108, %109
%111 = load i32, i32* %13, align 4
%112 = add nsw i32 %110, %111
%113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %112) #10
br label %117
; <label>:114: ; preds = %99, %94
%115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10
br label %117
; <label>:117: ; preds = %114, %105
%118 = phi float [ %113, %105 ], [ %116, %114 ]
store float %118, float* %17, align 4
%119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%120 = load float, float* %15, align 4
%121 = load float, float* %17, align 4
%122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10
store float %122, float* %15, align 4
%123 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %123) #9
br label %124
; <label>:124: ; preds = %117
%125 = load i32, i32* %16, align 4
%126 = add nsw i32 %125, 1
store i32 %126, i32* %16, align 4
br label %89
; <label>:127: ; preds = %92
%128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%129 = load i32, i32* %13, align 4
%130 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %129) #10
%131 = load float, float* %15, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10
%132 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %132) #9
%133 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %133) #9
%134 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %134) #9
br label %135
; <label>:135: ; preds = %127
%136 = load i32, i32* %11, align 4
%137 = add nsw i32 %136, 32768
store i32 %137, i32* %11, align 4
br label %64
; <label>:138: ; preds = %68
%139 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %139) #9
%140 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %140) #9
%141 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %141) #9
ret void
}
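; CudaMaxReducer variant of the ColumnReduceKernel above: identical control
; flow, but partials are folded with fmaxf and committed with the CAS-based
; atomic max.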
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; <label>:64: ; preds = %131, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %134
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %86, float* %15, align 4
%87 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %87) #9
store i32 0, i32* %16, align 4
br label %88
; <label>:88: ; preds = %121, %70
%89 = load i32, i32* %16, align 4
%90 = icmp slt i32 %89, 16
br i1 %90, label %93, label %91
; <label>:91: ; preds = %88
store i32 5, i32* %12, align 4
%92 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %92) #9
br label %124
; <label>:93: ; preds = %88
%94 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %94) #9
%95 = load i32, i32* %13, align 4
%96 = load i32, i32* %7, align 4
%97 = icmp slt i32 %95, %96
br i1 %97, label %98, label %113
; <label>:98: ; preds = %93
%99 = load i32, i32* %14, align 4
%100 = load i32, i32* %16, align 4
%101 = add nsw i32 %99, %100
%102 = load i32, i32* %6, align 4
%103 = icmp slt i32 %101, %102
br i1 %103, label %104, label %113
; <label>:104: ; preds = %98
%105 = load i32, i32* %14, align 4
%106 = load i32, i32* %16, align 4
%107 = add nsw i32 %105, %106
%108 = load i32, i32* %7, align 4
%109 = mul nsw i32 %107, %108
%110 = load i32, i32* %13, align 4
%111 = add nsw i32 %109, %110
%112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %111) #10
br label %115
; <label>:113: ; preds = %98, %93
%114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
br label %115
; <label>:115: ; preds = %113, %104
%116 = phi float [ %112, %104 ], [ %114, %113 ]
store float %116, float* %17, align 4
%117 = load float, float* %15, align 4
%118 = load float, float* %17, align 4
%119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10
store float %119, float* %15, align 4
%120 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %120) #9
br label %121
; <label>:121: ; preds = %115
%122 = load i32, i32* %16, align 4
%123 = add nsw i32 %122, 1
store i32 %123, i32* %16, align 4
br label %88
; <label>:124: ; preds = %91
%125 = load i32, i32* %13, align 4
%126 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %125) #10
%127 = load float, float* %15, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10
%128 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %128) #9
%129 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %129) #9
%130 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %130) #9
br label %131
; <label>:131: ; preds = %124
%132 = load i32, i32* %11, align 4
%133 = add nsw i32 %132, 32768
store i32 %133, i32* %11, align 4
br label %64
; <label>:134: ; preds = %68
%135 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %135) #9
%136 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %136) #9
%137 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %137) #9
ret void
}
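; EigenMetaKernel instantiation for
;   TensorMap<Tensor<float,1>> = TensorForcedEvalOp<sum-reduction of a 2D map>.
; It computes first = blockIdx.x * blockDim.x + threadIdx.x and
; step = blockDim.x * gridDim.x, copies the 168-byte byval evaluator into a
; local, and calls EigenMetaKernelEval<..., false>::run(evaluator, first,
; size, step). The i8 temporary stored as 0 looks like a bool materialized
; by the unoptimized build; it is never read before its lifetime ends.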
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.6"* byval align 8, i32) #0 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca %"struct.Eigen::TensorEvaluator.6", align 8
%7 = alloca i8, align 1
%8 = alloca %"struct.Eigen::TensorEvaluator.6", align 8
store i32 %1, i32* %3, align 4
%9 = bitcast i32* %4 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%12 = mul i32 %10, %11
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%14 = add i32 %12, %13
store i32 %14, i32* %4, align 4
%15 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %15) #9
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%18 = mul i32 %16, %17
store i32 %18, i32* %5, align 4
%19 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8*
call void @llvm.lifetime.start(i64 168, i8* %19) #9
%20 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8*
%21 = bitcast %"struct.Eigen::TensorEvaluator.6"* %0 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 168, i32 8, i1 false)
call void @llvm.lifetime.start(i64 1, i8* %7) #9
store i8 0, i8* %7, align 1
%22 = bitcast %"struct.Eigen::TensorEvaluator.6"* %8 to i8*
%23 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 168, i32 8, i1 false)
%24 = load i32, i32* %4, align 4
%25 = load i32, i32* %3, align 4
%26 = load i32, i32* %5, align 4
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.6"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10
call void @llvm.lifetime.end(i64 1, i8* %7) #9
%27 = bitcast %"struct.Eigen::TensorEvaluator.6"* %6 to i8*
call void @llvm.lifetime.end(i64 168, i8* %27) #9
%28 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %28) #9
%29 = bitcast i32* %4 to i8*
call void @llvm.lifetime.end(i64 4, i8* %29) #9
ret void
}
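; EigenMetaKernelEval<..., Vectorizable = false>::run(eval, first, size, step):
; the canonical grid-stride loop, calling evalScalar(i) for i = first,
; first + step, ... while i < size.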
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.6"* byval align 8, i32, i32, i32) #2 comdat align 2 {
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%9 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = load i32, i32* %5, align 4
store i32 %10, i32* %8, align 4
br label %11
; <label>:11: ; preds = %19, %4
%12 = load i32, i32* %8, align 4
%13 = load i32, i32* %6, align 4
%14 = icmp slt i32 %12, %13
br i1 %14, label %17, label %15
; <label>:15: ; preds = %11
%16 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
br label %23
; <label>:17: ; preds = %11
%18 = load i32, i32* %8, align 4
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.6"* %0, i32 %18) #10
br label %19
; <label>:19: ; preds = %17
%20 = load i32, i32* %7, align 4
%21 = load i32, i32* %8, align 4
%22 = add nsw i32 %21, %20
store i32 %22, i32* %8, align 4
br label %11
; <label>:23: ; preds = %15
ret void
}
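; evalScalar(i) on the assign evaluator: evaluates coefficient i of the
; right-hand side (field 1, the forced-eval sub-evaluator) and stores it
; through coeffRef(i) of the destination map (field 0).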
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.6"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.6"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.6"* %0, %"struct.Eigen::TensorEvaluator.6"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.6"*, %"struct.Eigen::TensorEvaluator.6"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %5, i32 0, i32 1
%7 = load i32, i32* %4, align 4
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.8"* %6, i32 %7) #10
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %5, i32 0, i32 0
%10 = load i32, i32* %4, align 4
%11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %9, i32 %10) #10
store float %8, float* %11, align 4
ret void
}
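; coeff(i) on the forced-eval evaluator: plain load from the already
; materialized buffer, the float* at field 3, at offset i.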
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.8"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.8"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.8"* %0, %"struct.Eigen::TensorEvaluator.8"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.8"*, %"struct.Eigen::TensorEvaluator.8"** %3, align 8
%6 = load i32, i32* %4, align 4
%7 = sext i32 %6 to i64
%8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %5, i32 0, i32 3
%9 = load float*, float** %8, align 8
%10 = getelementptr inbounds float, float* %9, i64 %7
%11 = load float, float* %10, align 4
ret float %11
}
; Function Attrs: convergent inlinehint nounwind
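; coeffRef(i) on the 1-D output TensorMap evaluator: returns &data[i],
; with data being the float* stored as the evaluator's first field.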
define linkonce_odr dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.7"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.7"* %0, %"struct.Eigen::TensorEvaluator.7"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.7"*, %"struct.Eigen::TensorEvaluator.7"** %3, align 8
%6 = load i32, i32* %4, align 4
%7 = sext i32 %6 to i64
%8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %5, i32 0, i32 0
%9 = load float*, float** %8, align 8
%10 = getelementptr inbounds float, float* %9, i64 %7
ret float* %10
}
; Function Attrs: convergent nounwind
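; __global__ kernel entry (EigenMetaKernel): computes
;   first_index = blockIdx.x * blockDim.x + threadIdx.x
;   step        = blockDim.x * gridDim.x
; copies the byval evaluator into a local, and forwards to the
; grid-stride run() loop below. This instantiation evaluates the sum
; reduction inline (no TensorForcedEvalOp in the expression type).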
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.11"* byval align 8, i32) #0 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca %"struct.Eigen::TensorEvaluator.11", align 8
%7 = alloca i8, align 1
%8 = alloca %"struct.Eigen::TensorEvaluator.11", align 8
store i32 %1, i32* %3, align 4
%9 = bitcast i32* %4 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%12 = mul i32 %10, %11
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%14 = add i32 %12, %13
store i32 %14, i32* %4, align 4
%15 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %15) #9
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%18 = mul i32 %16, %17
store i32 %18, i32* %5, align 4
%19 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8*
call void @llvm.lifetime.start(i64 136, i8* %19) #9
%20 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8*
%21 = bitcast %"struct.Eigen::TensorEvaluator.11"* %0 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 136, i32 8, i1 false)
call void @llvm.lifetime.start(i64 1, i8* %7) #9
store i8 0, i8* %7, align 1
%22 = bitcast %"struct.Eigen::TensorEvaluator.11"* %8 to i8*
%23 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 136, i32 8, i1 false)
%24 = load i32, i32* %4, align 4
%25 = load i32, i32* %3, align 4
%26 = load i32, i32* %5, align 4
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.11"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10
call void @llvm.lifetime.end(i64 1, i8* %7) #9
%27 = bitcast %"struct.Eigen::TensorEvaluator.11"* %6 to i8*
call void @llvm.lifetime.end(i64 136, i8* %27) #9
%28 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %28) #9
%29 = bitcast i32* %4 to i8*
call void @llvm.lifetime.end(i64 4, i8* %29) #9
ret void
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
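; Same grid-stride run() loop as above, instantiated for the evaluator
; that computes the reduction inline rather than reading a forced-eval
; buffer.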
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.11"* byval align 8, i32, i32, i32) #2 comdat align 2 {
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%9 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = load i32, i32* %5, align 4
store i32 %10, i32* %8, align 4
br label %11
; <label>:11: ; preds = %19, %4
%12 = load i32, i32* %8, align 4
%13 = load i32, i32* %6, align 4
%14 = icmp slt i32 %12, %13
br i1 %14, label %17, label %15
; <label>:15: ; preds = %11
%16 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
br label %23
; <label>:17: ; preds = %11
%18 = load i32, i32* %8, align 4
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.11"* %0, i32 %18) #10
br label %19
; <label>:19: ; preds = %17
%20 = load i32, i32* %7, align 4
%21 = load i32, i32* %8, align 4
%22 = add nsw i32 %21, %20
store i32 %22, i32* %8, align 4
br label %11
; <label>:23: ; preds = %15
ret void
}
; Function Attrs: convergent inlinehint nounwind
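; evalScalar(i) for the inline-reduction evaluator: computes the i-th
; reduced coefficient via the reduction evaluator's coeff(i) and stores
; it into the 1-D output.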
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.11"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.11"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.11"* %0, %"struct.Eigen::TensorEvaluator.11"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.11"*, %"struct.Eigen::TensorEvaluator.11"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %5, i32 0, i32 1
%7 = load i32, i32* %4, align 4
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator"* %6, i32 %7) #10
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %5, i32 0, i32 0
%10 = load i32, i32* %4, align 4
%11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %9, i32 %10) #10
store float %8, float* %11, align 4
ret void
}
; Function Attrs: convergent nounwind
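; InitVector kernel: grid-stride loop writing the init value into every
; output coefficient, presumably to seed the output before the atomic
; reduction kernels below. Roughly (assumed source shape):
;   for (int i = first; i < num_coeffs; i += blockDim.x * gridDim.x)
;     output.coeffRef(i) = val;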
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%4 = alloca float, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
store float %0, float* %4, align 4
store i32 %1, i32* %5, align 4
%8 = bitcast i32* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %8) #9
%9 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%10 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%11 = mul i32 %9, %10
%12 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%13 = add i32 %11, %12
store i32 %13, i32* %6, align 4
%14 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %14) #9
%15 = load i32, i32* %6, align 4
store i32 %15, i32* %7, align 4
br label %16
; <label>:16: ; preds = %26, %3
%17 = load i32, i32* %7, align 4
%18 = load i32, i32* %5, align 4
%19 = icmp slt i32 %17, %18
br i1 %19, label %22, label %20
; <label>:20: ; preds = %16
%21 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %21) #9
br label %32
; <label>:22: ; preds = %16
%23 = load float, float* %4, align 4
%24 = load i32, i32* %7, align 4
%25 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %2, i32 %24) #10
store float %23, float* %25, align 4
br label %26
; <label>:26: ; preds = %22
%27 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = mul i32 %27, %28
%30 = load i32, i32* %7, align 4
%31 = add i32 %30, %29
store i32 %31, i32* %7, align 4
br label %16
; <label>:32: ; preds = %20
%33 = bitcast i32* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %33) #9
ret void
}
; Function Attrs: convergent nounwind
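; RowReduceKernel<32, 256, 128, ..., CudaSumReducer>: reduces each row
; of the 2-D input to one output element. After the launch-config
; asserts, each thread accumulates up to 128 elements strided by the
; block size, partial sums are combined with a warp shuffle, and lane 0
; of each warp publishes its value with an atomic reduce. (Parameter
; roles of 32/256/128 inferred from the asserts and loop bounds below.)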
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
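; Launch-configuration asserts: blockDim must be (256,1,1) and gridDim
; must be (32,1,1); each failing check calls __assert_fail.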
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; <label>:74: ; preds = %237, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %240
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10
store float %99, float* %19, align 4
%100 = load i32, i32* %18, align 4
%101 = load i32, i32* %6, align 4
%102 = icmp slt i32 %100, %101
br i1 %102, label %103, label %198
; <label>:103: ; preds = %80
%104 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %104) #9
store i32 0, i32* %20, align 4
br label %105
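; Per-thread accumulation: the column offset advances 0..128 in steps
; of 16. If the 16-element chunk could run past num_coeffs, a
; bounds-checked path is taken; otherwise an unchecked unrolled path.
; Consecutive elements are strided by 256 (one per thread in the
; block), which keeps the loads coalesced.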
; <label>:105: ; preds = %192, %103
%106 = load i32, i32* %20, align 4
%107 = icmp slt i32 %106, 128
br i1 %107, label %109, label %108
; <label>:108: ; preds = %105
store i32 5, i32* %14, align 4
br label %195
; <label>:109: ; preds = %105
%110 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %110) #9
%111 = load i32, i32* %17, align 4
%112 = load i32, i32* %20, align 4
%113 = add nsw i32 %112, 16
%114 = sub nsw i32 %113, 1
%115 = mul nsw i32 256, %114
%116 = add nsw i32 %111, %115
store i32 %116, i32* %21, align 4
%117 = load i32, i32* %21, align 4
%118 = load i32, i32* %7, align 4
%119 = icmp sge i32 %117, %118
br i1 %119, label %120, label %158
; <label>:120: ; preds = %109
%121 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %121) #9
store i32 0, i32* %22, align 4
br label %122
; <label>:122: ; preds = %152, %120
%123 = load i32, i32* %22, align 4
%124 = icmp slt i32 %123, 15
br i1 %124, label %126, label %125
; <label>:125: ; preds = %122
store i32 8, i32* %14, align 4
br label %155
; <label>:126: ; preds = %122
%127 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %127) #9
%128 = load i32, i32* %17, align 4
%129 = load i32, i32* %20, align 4
%130 = load i32, i32* %22, align 4
%131 = add nsw i32 %129, %130
%132 = mul nsw i32 256, %131
%133 = add nsw i32 %128, %132
store i32 %133, i32* %23, align 4
%134 = load i32, i32* %23, align 4
%135 = load i32, i32* %7, align 4
%136 = icmp sge i32 %134, %135
br i1 %136, label %137, label %138
; <label>:137: ; preds = %126
store i32 8, i32* %14, align 4
br label %148
; <label>:138: ; preds = %126
%139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%140 = load float, float* %19, align 4
%141 = load i32, i32* %18, align 4
%142 = load i32, i32* %7, align 4
%143 = mul nsw i32 %141, %142
%144 = load i32, i32* %23, align 4
%145 = add nsw i32 %143, %144
%146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %145) #10
%147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10
store float %147, float* %19, align 4
store i32 0, i32* %14, align 4
br label %148
; <label>:148: ; preds = %138, %137
%149 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %149) #9
%150 = load i32, i32* %14, align 4
switch i32 %150, label %155 [
i32 0, label %151
]
; <label>:151: ; preds = %148
br label %152
; <label>:152: ; preds = %151
%153 = load i32, i32* %22, align 4
%154 = add nsw i32 %153, 1
store i32 %154, i32* %22, align 4
br label %122, !llvm.loop !61
; <label>:155: ; preds = %148, %125
%156 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %156) #9
br label %157
; <label>:157: ; preds = %155
store i32 5, i32* %14, align 4
br label %188
; <label>:158: ; preds = %109
%159 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %159) #9
store i32 0, i32* %24, align 4
br label %160
; <label>:160: ; preds = %183, %158
%161 = load i32, i32* %24, align 4
%162 = icmp slt i32 %161, 16
br i1 %162, label %165, label %163
; <label>:163: ; preds = %160
store i32 11, i32* %14, align 4
%164 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %164) #9
br label %186
; <label>:165: ; preds = %160
%166 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %166) #9
%167 = load i32, i32* %17, align 4
%168 = load i32, i32* %20, align 4
%169 = load i32, i32* %24, align 4
%170 = add nsw i32 %168, %169
%171 = mul nsw i32 256, %170
%172 = add nsw i32 %167, %171
store i32 %172, i32* %25, align 4
%173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%174 = load float, float* %19, align 4
%175 = load i32, i32* %18, align 4
%176 = load i32, i32* %7, align 4
%177 = mul nsw i32 %175, %176
%178 = load i32, i32* %25, align 4
%179 = add nsw i32 %177, %178
%180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %179) #10
%181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10
store float %181, float* %19, align 4
%182 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %182) #9
br label %183
; <label>:183: ; preds = %165
%184 = load i32, i32* %24, align 4
%185 = add nsw i32 %184, 1
store i32 %185, i32* %24, align 4
br label %160, !llvm.loop !62
; <label>:186: ; preds = %163
br label %187
; <label>:187: ; preds = %186
store i32 0, i32* %14, align 4
br label %188
; <label>:188: ; preds = %187, %157
%189 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %189) #9
%190 = load i32, i32* %14, align 4
switch i32 %190, label %195 [
i32 0, label %191
]
; <label>:191: ; preds = %188
br label %192
; <label>:192: ; preds = %191
%193 = load i32, i32* %20, align 4
%194 = add nsw i32 %193, 16
store i32 %194, i32* %20, align 4
br label %105, !llvm.loop !63
; <label>:195: ; preds = %188, %108
%196 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %196) #9
br label %197
; <label>:197: ; preds = %195
br label %198
; <label>:198: ; preds = %197, %80
%199 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %199) #9
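; Warp-level tree reduction: the shuffle offset starts at 16 and halves
; each iteration (16, 8, 4, 2, 1), combining lane values with
; __shfl_down before lane 0 does the atomic update.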
store i32 16, i32* %26, align 4
br label %200
; <label>:200: ; preds = %212, %198
%201 = load i32, i32* %26, align 4
%202 = icmp sgt i32 %201, 0
br i1 %202, label %205, label %203
; <label>:203: ; preds = %200
store i32 14, i32* %14, align 4
%204 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %204) #9
br label %215
; <label>:205: ; preds = %200
%206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%207 = load float, float* %19, align 4
%208 = load i32, i32* %26, align 4
%209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10
%210 = load float, float* %19, align 4
%211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10
store float %211, float* %19, align 4
br label %212
; <label>:212: ; preds = %205
%213 = load i32, i32* %26, align 4
%214 = sdiv i32 %213, 2
store i32 %214, i32* %26, align 4
br label %200, !llvm.loop !64
; <label>:215: ; preds = %203
%216 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %216) #9
%217 = load i32, i32* %12, align 4
%218 = and i32 %217, 31
store i32 %218, i32* %27, align 4
%219 = load i32, i32* %27, align 4
%220 = icmp eq i32 %219, 0
br i1 %220, label %221, label %230
; <label>:221: ; preds = %215
%222 = load i32, i32* %18, align 4
%223 = load i32, i32* %6, align 4
%224 = icmp slt i32 %222, %223
br i1 %224, label %225, label %230
; <label>:225: ; preds = %221
%226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%227 = load i32, i32* %18, align 4
%228 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %227) #10
%229 = load float, float* %19, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10
br label %230
; <label>:230: ; preds = %225, %221, %215
%231 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
%232 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %232) #9
%233 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %233) #9
%234 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %234) #9
%235 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %235) #9
%236 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
br label %237
; <label>:237: ; preds = %230
%238 = load i32, i32* %13, align 4
%239 = add nsw i32 %238, 32
store i32 %239, i32* %13, align 4
br label %74
; <label>:240: ; preds = %78
%241 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %241) #9
%242 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %242) #9
%243 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %243) #9
%244 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %244) #9
%245 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %245) #9
ret void
}
; Function Attrs: convergent nounwind
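; RowReduceKernel instantiated with CudaMaxReducer: identical control
; flow to the Sum variant above (same asserts, accumulation loops, and
; warp shuffle), but combines with the max functor and seeds the
; accumulator from the reducer's bottom value.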
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; <label>:74: ; preds = %232, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %235
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %98, float* %19, align 4
%99 = load i32, i32* %18, align 4
%100 = load i32, i32* %6, align 4
%101 = icmp slt i32 %99, %100
br i1 %101, label %102, label %195
; <label>:102: ; preds = %80
%103 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %103) #9
store i32 0, i32* %20, align 4
br label %104
; <label>:104: ; preds = %189, %102
%105 = load i32, i32* %20, align 4
%106 = icmp slt i32 %105, 128
br i1 %106, label %108, label %107
; <label>:107: ; preds = %104
store i32 5, i32* %14, align 4
br label %192
; <label>:108: ; preds = %104
%109 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %109) #9
%110 = load i32, i32* %17, align 4
%111 = load i32, i32* %20, align 4
%112 = add nsw i32 %111, 16
%113 = sub nsw i32 %112, 1
%114 = mul nsw i32 256, %113
%115 = add nsw i32 %110, %114
store i32 %115, i32* %21, align 4
%116 = load i32, i32* %21, align 4
%117 = load i32, i32* %7, align 4
%118 = icmp sge i32 %116, %117
br i1 %118, label %119, label %156
; <label>:119: ; preds = %108
%120 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %120) #9
store i32 0, i32* %22, align 4
br label %121
; <label>:121: ; preds = %150, %119
%122 = load i32, i32* %22, align 4
%123 = icmp slt i32 %122, 15
br i1 %123, label %125, label %124
; <label>:124: ; preds = %121
store i32 8, i32* %14, align 4
br label %153
; <label>:125: ; preds = %121
%126 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %126) #9
%127 = load i32, i32* %17, align 4
%128 = load i32, i32* %20, align 4
%129 = load i32, i32* %22, align 4
%130 = add nsw i32 %128, %129
%131 = mul nsw i32 256, %130
%132 = add nsw i32 %127, %131
store i32 %132, i32* %23, align 4
%133 = load i32, i32* %23, align 4
%134 = load i32, i32* %7, align 4
%135 = icmp sge i32 %133, %134
br i1 %135, label %136, label %137
; <label>:136: ; preds = %125
store i32 8, i32* %14, align 4
br label %146
; <label>:137: ; preds = %125
%138 = load float, float* %19, align 4
%139 = load i32, i32* %18, align 4
%140 = load i32, i32* %7, align 4
%141 = mul nsw i32 %139, %140
%142 = load i32, i32* %23, align 4
%143 = add nsw i32 %141, %142
%144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %143) #10
%145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10
store float %145, float* %19, align 4
store i32 0, i32* %14, align 4
br label %146
; <label>:146: ; preds = %137, %136
%147 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %147) #9
%148 = load i32, i32* %14, align 4
switch i32 %148, label %153 [
i32 0, label %149
]
; <label>:149: ; preds = %146
br label %150
; <label>:150: ; preds = %149
%151 = load i32, i32* %22, align 4
%152 = add nsw i32 %151, 1
store i32 %152, i32* %22, align 4
br label %121, !llvm.loop !65
; <label>:153: ; preds = %146, %124
%154 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %154) #9
br label %155
; <label>:155: ; preds = %153
store i32 5, i32* %14, align 4
br label %185
; <label>:156: ; preds = %108
%157 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %157) #9
store i32 0, i32* %24, align 4
br label %158
; <label>:158: ; preds = %180, %156
%159 = load i32, i32* %24, align 4
%160 = icmp slt i32 %159, 16
br i1 %160, label %163, label %161
; <label>:161: ; preds = %158
store i32 11, i32* %14, align 4
%162 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %162) #9
br label %183
; <label>:163: ; preds = %158
%164 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %164) #9
%165 = load i32, i32* %17, align 4
%166 = load i32, i32* %20, align 4
%167 = load i32, i32* %24, align 4
%168 = add nsw i32 %166, %167
%169 = mul nsw i32 256, %168
%170 = add nsw i32 %165, %169
store i32 %170, i32* %25, align 4
%171 = load float, float* %19, align 4
%172 = load i32, i32* %18, align 4
%173 = load i32, i32* %7, align 4
%174 = mul nsw i32 %172, %173
%175 = load i32, i32* %25, align 4
%176 = add nsw i32 %174, %175
%177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %176) #10
%178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10
store float %178, float* %19, align 4
%179 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %179) #9
br label %180
; <label>:180: ; preds = %163
%181 = load i32, i32* %24, align 4
%182 = add nsw i32 %181, 1
store i32 %182, i32* %24, align 4
br label %158, !llvm.loop !66
; <label>:183: ; preds = %161
br label %184
; <label>:184: ; preds = %183
store i32 0, i32* %14, align 4
br label %185
; <label>:185: ; preds = %184, %155
%186 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %186) #9
%187 = load i32, i32* %14, align 4
switch i32 %187, label %192 [
i32 0, label %188
]
; <label>:188: ; preds = %185
br label %189
; <label>:189: ; preds = %188
%190 = load i32, i32* %20, align 4
%191 = add nsw i32 %190, 16
store i32 %191, i32* %20, align 4
br label %104, !llvm.loop !67
; <label>:192: ; preds = %185, %107
%193 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %193) #9
br label %194
; <label>:194: ; preds = %192
br label %195
; <label>:195: ; preds = %194, %80
%196 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %196) #9
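; Same warp shuffle tree reduction as in the Sum variant (offsets
; 16, 8, 4, 2, 1), using the max functor.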
store i32 16, i32* %26, align 4
br label %197
; <label>:197: ; preds = %208, %195
%198 = load i32, i32* %26, align 4
%199 = icmp sgt i32 %198, 0
br i1 %199, label %202, label %200
; <label>:200: ; preds = %197
store i32 14, i32* %14, align 4
%201 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %201) #9
br label %211
; <label>:202: ; preds = %197
%203 = load float, float* %19, align 4
%204 = load i32, i32* %26, align 4
%205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10
%206 = load float, float* %19, align 4
%207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10
store float %207, float* %19, align 4
br label %208
; <label>:208: ; preds = %202
%209 = load i32, i32* %26, align 4
%210 = sdiv i32 %209, 2
store i32 %210, i32* %26, align 4
br label %197, !llvm.loop !68
; <label>:211: ; preds = %200
%212 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %212) #9
%213 = load i32, i32* %12, align 4
%214 = and i32 %213, 31
store i32 %214, i32* %27, align 4
%215 = load i32, i32* %27, align 4
%216 = icmp eq i32 %215, 0
br i1 %216, label %217, label %225
; <label>:217: ; preds = %211
%218 = load i32, i32* %18, align 4
%219 = load i32, i32* %6, align 4
%220 = icmp slt i32 %218, %219
br i1 %220, label %221, label %225
; <label>:221: ; preds = %217
%222 = load i32, i32* %18, align 4
%223 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %222) #10
%224 = load float, float* %19, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10
br label %225
; <label>:225: ; preds = %221, %217, %211
%226 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %226) #9
%227 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %227) #9
%228 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %228) #9
%229 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %229) #9
%230 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %230) #9
%231 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
br label %232
; <label>:232: ; preds = %225
%233 = load i32, i32* %13, align 4
%234 = add nsw i32 %233, 32
store i32 %234, i32* %13, align 4
br label %74
; <label>:235: ; preds = %78
%236 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
%237 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %237) #9
%238 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %238) #9
%239 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %239) #9
%240 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %240) #9
ret void
}
; Function Attrs: convergent nounwind
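; ColumnReduceKernel<128, 256, 16, ..., CudaSumReducer>: reduces each
; column of the 2-D input. Each thread accumulates 16 row elements of
; one column into a register, then issues a single atomic reduce into
; output[col]. Asserts gridDim == (128,1,1) and blockDim == (256,1,1).
; (Parameter roles of 128/256/16 inferred from the asserts and loop
; bounds below.)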
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; <label>:64: ; preds = %135, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %138
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10
store float %87, float* %15, align 4
%88 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %88) #9
store i32 0, i32* %16, align 4
br label %89
; <label>:89: ; preds = %124, %70
%90 = load i32, i32* %16, align 4
%91 = icmp slt i32 %90, 16
br i1 %91, label %94, label %92
; <label>:92: ; preds = %89
store i32 5, i32* %12, align 4
%93 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %93) #9
br label %127
; <label>:94: ; preds = %89
%95 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %13, align 4
%97 = load i32, i32* %7, align 4
%98 = icmp slt i32 %96, %97
br i1 %98, label %99, label %114
; <label>:99: ; preds = %94
%100 = load i32, i32* %14, align 4
%101 = load i32, i32* %16, align 4
%102 = add nsw i32 %100, %101
%103 = load i32, i32* %6, align 4
%104 = icmp slt i32 %102, %103
br i1 %104, label %105, label %114
; <label>:105: ; preds = %99
%106 = load i32, i32* %14, align 4
%107 = load i32, i32* %16, align 4
%108 = add nsw i32 %106, %107
%109 = load i32, i32* %7, align 4
%110 = mul nsw i32 %108, %109
%111 = load i32, i32* %13, align 4
%112 = add nsw i32 %110, %111
%113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %112) #10
br label %117
; <label>:114: ; preds = %99, %94
%115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10
br label %117
; <label>:117: ; preds = %114, %105
%118 = phi float [ %113, %105 ], [ %116, %114 ]
store float %118, float* %17, align 4
%119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%120 = load float, float* %15, align 4
%121 = load float, float* %17, align 4
%122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10
store float %122, float* %15, align 4
%123 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %123) #9
br label %124
; <label>:124: ; preds = %117
%125 = load i32, i32* %16, align 4
%126 = add nsw i32 %125, 1
store i32 %126, i32* %16, align 4
br label %89
; <label>:127: ; preds = %92
%128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%129 = load i32, i32* %13, align 4
%130 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %129) #10
%131 = load float, float* %15, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10
%132 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %132) #9
%133 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %133) #9
%134 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %134) #9
br label %135
; <label>:135: ; preds = %127
%136 = load i32, i32* %11, align 4
%137 = add nsw i32 %136, 32768
store i32 %137, i32* %11, align 4
br label %64
; <label>:138: ; preds = %68
%139 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %139) #9
%140 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %140) #9
%141 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %141) #9
ret void
}
; Function Attrs: convergent nounwind
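; ColumnReduceKernel instantiated with CudaMaxReducer: same structure
; as the Sum variant above, combining with the max functor and its
; bottom value.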
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; <label>:64: ; preds = %131, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %134
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %86, float* %15, align 4
%87 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %87) #9
store i32 0, i32* %16, align 4
br label %88
; <label>:88: ; preds = %121, %70
%89 = load i32, i32* %16, align 4
%90 = icmp slt i32 %89, 16
br i1 %90, label %93, label %91
; <label>:91: ; preds = %88
store i32 5, i32* %12, align 4
%92 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %92) #9
br label %124
; <label>:93: ; preds = %88
%94 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %94) #9
%95 = load i32, i32* %13, align 4
%96 = load i32, i32* %7, align 4
%97 = icmp slt i32 %95, %96
br i1 %97, label %98, label %113
; <label>:98: ; preds = %93
%99 = load i32, i32* %14, align 4
%100 = load i32, i32* %16, align 4
%101 = add nsw i32 %99, %100
%102 = load i32, i32* %6, align 4
%103 = icmp slt i32 %101, %102
br i1 %103, label %104, label %113
; <label>:104: ; preds = %98
%105 = load i32, i32* %14, align 4
%106 = load i32, i32* %16, align 4
%107 = add nsw i32 %105, %106
%108 = load i32, i32* %7, align 4
%109 = mul nsw i32 %107, %108
%110 = load i32, i32* %13, align 4
%111 = add nsw i32 %109, %110
%112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %1, i32 %111) #10
br label %115
; <label>:113: ; preds = %98, %93
%114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
br label %115
; <label>:115: ; preds = %113, %104
%116 = phi float [ %112, %104 ], [ %114, %113 ]
store float %116, float* %17, align 4
%117 = load float, float* %15, align 4
%118 = load float, float* %17, align 4
%119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10
store float %119, float* %15, align 4
%120 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %120) #9
br label %121
; <label>:121: ; preds = %115
%122 = load i32, i32* %16, align 4
%123 = add nsw i32 %122, 1
store i32 %123, i32* %16, align 4
br label %88
; <label>:124: ; preds = %91
%125 = load i32, i32* %13, align 4
%126 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.7"* %4, i32 %125) #10
%127 = load float, float* %15, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10
%128 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %128) #9
%129 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %129) #9
%130 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %130) #9
br label %131
; <label>:131: ; preds = %124
%132 = load i32, i32* %11, align 4
%133 = add nsw i32 %132, 32768
store i32 %133, i32* %11, align 4
br label %64
; <label>:134: ; preds = %68
%135 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %135) #9
%136 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %136) #9
%137 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %137) #9
ret void
}
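; -----------------------------------------------------------------------------
; Editor's note: FullReductionKernel<BlockSize=256, NumPerThread=128> for
; SumReducer<float> (parameters taken from the mangled name). When
; gridDim.x == 1, thread 0 seeds *output with reducer.initialize() and all
; threads sync before accumulating. Each thread then folds
; min(num_coeffs - first_index, 256 * 128) coefficients with stride 256,
; tree-reduces its warp with __shfl_down offsets 16, 8, 4, 2, 1, and lane 0
; merges via atomicReduce. A hedged sketch of the per-thread body:
;   const int first = blockIdx.x * 256 * 128 + threadIdx.x;
;   float acc = reducer.initialize();
;   for (int i = 0; i < min(num_coeffs - first, 32768); i += 256)
;     reducer.reduce(input.coeff(first + i), &acc);
;   for (int off = 16; off > 0; off /= 2)
;     reducer.reduce(__shfl_down(acc, off, 32), &acc);
;   if ((threadIdx.x & 31) == 0) atomicReduce(output, acc, reducer);
; -----------------------------------------------------------------------------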
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, float*) #0 comdat {
%5 = alloca i32, align 4
%6 = alloca float*, align 8
%7 = alloca i32, align 4
%8 = alloca float, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca float, align 4
%15 = alloca i32, align 4
store i32 %2, i32* %5, align 4
store float* %3, float** %6, align 8
%16 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %16) #9
%17 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%18 = mul i32 %17, 256
%19 = mul i32 %18, 128
%20 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%21 = add i32 %19, %20
store i32 %21, i32* %7, align 4
%22 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%23 = icmp eq i32 %22, 1
br i1 %23, label %24, label %31
; <label>:24: ; preds = %4
%25 = load i32, i32* %7, align 4
%26 = icmp eq i32 %25, 0
br i1 %26, label %27, label %30
; <label>:27: ; preds = %24
%28 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
%29 = load float*, float** %6, align 8
store float %28, float* %29, align 4
br label %30
; <label>:30: ; preds = %27, %24
call void @llvm.cuda.syncthreads()
br label %31
; <label>:31: ; preds = %30, %4
%32 = bitcast float* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %32) #9
%33 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
store float %33, float* %8, align 4
%34 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %34) #9
%35 = load i32, i32* %5, align 4
%36 = load i32, i32* %7, align 4
%37 = sub nsw i32 %35, %36
store i32 %37, i32* %10, align 4
store i32 32768, i32* %11, align 4
%38 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %10, i32* dereferenceable(4) %11) #10
store i32 %38, i32* %9, align 4
%39 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %39) #9
store i32 0, i32* %12, align 4
br label %40
; <label>:40: ; preds = %58, %31
%41 = load i32, i32* %12, align 4
%42 = load i32, i32* %9, align 4
%43 = icmp slt i32 %41, %42
br i1 %43, label %46, label %44
; <label>:44: ; preds = %40
%45 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %45) #9
br label %61
; <label>:46: ; preds = %40
%47 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %47) #9
%48 = load i32, i32* %7, align 4
%49 = load i32, i32* %12, align 4
%50 = add nsw i32 %48, %49
store i32 %50, i32* %13, align 4
%51 = bitcast float* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %51) #9
%52 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10
%53 = load i32, i32* %13, align 4
%54 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %52, i32 %53) #10
store float %54, float* %14, align 4
%55 = load float, float* %14, align 4
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %55, float* %8) #10
%56 = bitcast float* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %56) #9
%57 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %57) #9
br label %58
; <label>:58: ; preds = %46
%59 = load i32, i32* %12, align 4
%60 = add nsw i32 %59, 256
store i32 %60, i32* %12, align 4
br label %40, !llvm.loop !69
; <label>:61: ; preds = %44
%62 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %62) #9
store i32 16, i32* %15, align 4
br label %63
; <label>:63: ; preds = %72, %61
%64 = load i32, i32* %15, align 4
%65 = icmp sgt i32 %64, 0
br i1 %65, label %68, label %66
; <label>:66: ; preds = %63
%67 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %67) #9
br label %75
; <label>:68: ; preds = %63
%69 = load float, float* %8, align 4
%70 = load i32, i32* %15, align 4
%71 = call float @_ZL11__shfl_downfji(float %69, i32 %70, i32 32) #10
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %71, float* %8) #10
br label %72
; <label>:72: ; preds = %68
%73 = load i32, i32* %15, align 4
%74 = sdiv i32 %73, 2
store i32 %74, i32* %15, align 4
br label %63, !llvm.loop !70
; <label>:75: ; preds = %66
%76 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%77 = and i32 %76, 31
%78 = icmp eq i32 %77, 0
br i1 %78, label %79, label %82
; <label>:79: ; preds = %75
%80 = load float*, float** %6, align 8
%81 = load float, float* %8, align 4
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %80, float %81, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10
br label %82
; <label>:82: ; preds = %79, %75
%83 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %83) #9
%84 = bitcast float* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %84) #9
%85 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %85) #9
ret void
}
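; Editor's note: the coeff(int) accessor below is a plain indexed load of
; m_data[index] routed through Eigen::(anonymous namespace)::loadConstant<float>,
; which (per the Eigen sources, for sm_35 builds like this one) presumably
; lowers to a read-only __ldg load.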
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.13"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.13"* %0, %"struct.Eigen::TensorEvaluator.13"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.13"*, %"struct.Eigen::TensorEvaluator.13"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %5, i32 0, i32 0
%7 = load float*, float** %6, align 8
%8 = load i32, i32* %4, align 4
%9 = sext i32 %8 to i64
%10 = getelementptr inbounds float, float* %7, i64 %9
%11 = call float @_ZN5Eigen12_GLOBAL__N_112loadConstantIfEET_PKS2_(float* %10) #10
ret float %11
}
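; -----------------------------------------------------------------------------
; Editor's note: InnerReductionKernel<NumPerThread=128> for SumReducer<float>
; (read off the mangled name). If gridDim.x == 1 the grid first initializes
; all num_preserved outputs. Blocks then stride (by gridDim.x) over
; divup(num_coeffs_to_reduce, blockDim.x * 128) * num_preserved work items,
; each item being one (output row, column block) pair. The 128 coefficients
; per thread are processed in strips of 16: a bounds-free 16-iteration loop
; when the strip fits, and a bounds-checked fallback loop when the strip's
; last column would run past num_coeffs_to_reduce. Each row finishes with a
; __shfl_down warp reduction plus an atomicReduce by lane 0, followed by a
; block-wide __syncthreads before the next item.
; -----------------------------------------------------------------------------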
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca float*, align 8
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca i32, align 4
%16 = alloca i32
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca i32, align 4
%20 = alloca float, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca float, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
store float* %4, float** %8, align 8
%28 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %28) #9
store i32 16, i32* %9, align 4
%29 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %29) #9
%30 = load i32, i32* %6, align 4
%31 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%32 = mul i32 %31, 128
%33 = call i32 @_ZN5Eigen5divupIiijEET_T0_T1_(i32 %30, i32 %32) #10
store i32 %33, i32* %10, align 4
%34 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %34) #9
%35 = load i32, i32* %10, align 4
%36 = load i32, i32* %7, align 4
%37 = mul nsw i32 %35, %36
store i32 %37, i32* %11, align 4
%38 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %38) #9
%39 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%40 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%41 = mul i32 %39, %40
store i32 %41, i32* %12, align 4
%42 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %42) #9
%43 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%44 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%45 = mul i32 %43, %44
%46 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%47 = add i32 %45, %46
store i32 %47, i32* %13, align 4
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %70
; <label>:50: ; preds = %5
%51 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %51) #9
%52 = load i32, i32* %13, align 4
store i32 %52, i32* %14, align 4
br label %53
; <label>:53: ; preds = %65, %50
%54 = load i32, i32* %14, align 4
%55 = load i32, i32* %7, align 4
%56 = icmp slt i32 %54, %55
br i1 %56, label %59, label %57
; <label>:57: ; preds = %53
%58 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %58) #9
br label %69
; <label>:59: ; preds = %53
%60 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
%61 = load i32, i32* %14, align 4
%62 = sext i32 %61 to i64
%63 = load float*, float** %8, align 8
%64 = getelementptr inbounds float, float* %63, i64 %62
store float %60, float* %64, align 4
br label %65
; <label>:65: ; preds = %59
%66 = load i32, i32* %12, align 4
%67 = load i32, i32* %14, align 4
%68 = add nsw i32 %67, %66
store i32 %68, i32* %14, align 4
br label %53
; <label>:69: ; preds = %57
br label %70
; <label>:70: ; preds = %69, %5
%71 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %72, i32* %15, align 4
br label %73
; <label>:73: ; preds = %215, %70
%74 = load i32, i32* %15, align 4
%75 = load i32, i32* %11, align 4
%76 = icmp slt i32 %74, %75
br i1 %76, label %79, label %77
; <label>:77: ; preds = %73
store i32 5, i32* %16, align 4
%78 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %78) #9
br label %219
; <label>:79: ; preds = %73
%80 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %80) #9
%81 = load i32, i32* %15, align 4
%82 = load i32, i32* %10, align 4
%83 = sdiv i32 %81, %82
store i32 %83, i32* %17, align 4
%84 = load i32, i32* %17, align 4
%85 = load i32, i32* %7, align 4
%86 = icmp slt i32 %84, %85
br i1 %86, label %87, label %213
; <label>:87: ; preds = %79
%88 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %88) #9
%89 = load i32, i32* %15, align 4
%90 = load i32, i32* %10, align 4
%91 = srem i32 %89, %90
store i32 %91, i32* %18, align 4
%92 = bitcast i32* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %92) #9
%93 = load i32, i32* %18, align 4
%94 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%95 = mul i32 %93, %94
%96 = mul i32 %95, 128
%97 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%98 = add i32 %96, %97
store i32 %98, i32* %19, align 4
%99 = bitcast float* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %99) #9
%100 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
store float %100, float* %20, align 4
%101 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %101) #9
store i32 0, i32* %21, align 4
br label %102
; <label>:102: ; preds = %180, %87
%103 = load i32, i32* %21, align 4
%104 = icmp slt i32 %103, 128
br i1 %104, label %106, label %105
; <label>:105: ; preds = %102
store i32 8, i32* %16, align 4
br label %183
; <label>:106: ; preds = %102
%107 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %107) #9
%108 = load i32, i32* %19, align 4
%109 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%110 = load i32, i32* %21, align 4
%111 = add nsw i32 %110, 16
%112 = sub nsw i32 %111, 1
%113 = mul i32 %109, %112
%114 = add i32 %108, %113
store i32 %114, i32* %22, align 4
%115 = load i32, i32* %22, align 4
%116 = load i32, i32* %6, align 4
%117 = icmp sge i32 %115, %116
br i1 %117, label %118, label %147
; <label>:118: ; preds = %106
%119 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %119) #9
%120 = load i32, i32* %19, align 4
%121 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%122 = load i32, i32* %21, align 4
%123 = mul i32 %121, %122
%124 = add i32 %120, %123
store i32 %124, i32* %23, align 4
br label %125
; <label>:125: ; preds = %142, %118
%126 = load i32, i32* %23, align 4
%127 = load i32, i32* %6, align 4
%128 = icmp slt i32 %126, %127
br i1 %128, label %131, label %129
; <label>:129: ; preds = %125
store i32 11, i32* %16, align 4
%130 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %130) #9
br label %146
; <label>:131: ; preds = %125
%132 = bitcast float* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %132) #9
%133 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10
%134 = load i32, i32* %17, align 4
%135 = load i32, i32* %6, align 4
%136 = mul nsw i32 %134, %135
%137 = load i32, i32* %23, align 4
%138 = add nsw i32 %136, %137
%139 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %133, i32 %138) #10
store float %139, float* %24, align 4
%140 = load float, float* %24, align 4
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %140, float* %20) #10
%141 = bitcast float* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %141) #9
br label %142
; <label>:142: ; preds = %131
%143 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%144 = load i32, i32* %23, align 4
%145 = add i32 %144, %143
store i32 %145, i32* %23, align 4
br label %125
; <label>:146: ; preds = %129
store i32 8, i32* %16, align 4
br label %176
; <label>:147: ; preds = %106
%148 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %148) #9
store i32 0, i32* %25, align 4
br label %149
; <label>:149: ; preds = %171, %147
%150 = load i32, i32* %25, align 4
%151 = icmp slt i32 %150, 16
br i1 %151, label %154, label %152
; <label>:152: ; preds = %149
store i32 14, i32* %16, align 4
%153 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %153) #9
br label %174
; <label>:154: ; preds = %149
%155 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %155) #9
%156 = load i32, i32* %19, align 4
%157 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%158 = load i32, i32* %21, align 4
%159 = load i32, i32* %25, align 4
%160 = add nsw i32 %158, %159
%161 = mul i32 %157, %160
%162 = add i32 %156, %161
store i32 %162, i32* %26, align 4
%163 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10
%164 = load i32, i32* %17, align 4
%165 = load i32, i32* %6, align 4
%166 = mul nsw i32 %164, %165
%167 = load i32, i32* %26, align 4
%168 = add nsw i32 %166, %167
%169 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %163, i32 %168) #10
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %169, float* %20) #10
%170 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %170) #9
br label %171
; <label>:171: ; preds = %154
%172 = load i32, i32* %25, align 4
%173 = add nsw i32 %172, 1
store i32 %173, i32* %25, align 4
br label %149, !llvm.loop !71
; <label>:174: ; preds = %152
br label %175
; <label>:175: ; preds = %174
store i32 0, i32* %16, align 4
br label %176
; <label>:176: ; preds = %175, %146
%177 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %177) #9
%178 = load i32, i32* %16, align 4
switch i32 %178, label %183 [
i32 0, label %179
]
; <label>:179: ; preds = %176
br label %180
; <label>:180: ; preds = %179
%181 = load i32, i32* %21, align 4
%182 = add nsw i32 %181, 16
store i32 %182, i32* %21, align 4
br label %102
; <label>:183: ; preds = %176, %105
%184 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %184) #9
br label %185
; <label>:185: ; preds = %183
call void @llvm.cuda.syncthreads()
%186 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %186) #9
store i32 16, i32* %27, align 4
br label %187
; <label>:187: ; preds = %196, %185
%188 = load i32, i32* %27, align 4
%189 = icmp sgt i32 %188, 0
br i1 %189, label %192, label %190
; <label>:190: ; preds = %187
store i32 17, i32* %16, align 4
%191 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %191) #9
br label %199
; <label>:192: ; preds = %187
%193 = load float, float* %20, align 4
%194 = load i32, i32* %27, align 4
%195 = call float @_ZL11__shfl_downfji(float %193, i32 %194, i32 32) #10
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %195, float* %20) #10
br label %196
; <label>:196: ; preds = %192
%197 = load i32, i32* %27, align 4
%198 = sdiv i32 %197, 2
store i32 %198, i32* %27, align 4
br label %187
; <label>:199: ; preds = %190
%200 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%201 = and i32 %200, 31
%202 = icmp eq i32 %201, 0
br i1 %202, label %203, label %209
; <label>:203: ; preds = %199
%204 = load i32, i32* %17, align 4
%205 = sext i32 %204 to i64
%206 = load float*, float** %8, align 8
%207 = getelementptr inbounds float, float* %206, i64 %205
%208 = load float, float* %20, align 4
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %207, float %208, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10
br label %209
; <label>:209: ; preds = %203, %199
%210 = bitcast float* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %210) #9
%211 = bitcast i32* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %211) #9
%212 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %212) #9
br label %213
; <label>:213: ; preds = %209, %79
call void @llvm.cuda.syncthreads()
%214 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %214) #9
br label %215
; <label>:215: ; preds = %213
%216 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%217 = load i32, i32* %15, align 4
%218 = add i32 %217, %216
store i32 %218, i32* %15, align 4
br label %73
; <label>:219: ; preds = %77
%220 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %220) #9
%221 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %221) #9
%222 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %222) #9
%223 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %223) #9
%224 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %224) #9
ret void
}
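; -----------------------------------------------------------------------------
; Editor's note: OuterReductionKernel<NumPerThread=16> for SumReducer<float>.
; After the optional gridDim.x == 1 output initialization and sync, every
; thread strides (by blockDim.x * gridDim.x) over
; num_preserved * divup(num_coeffs_to_reduce, 16) work items, each covering
; one preserved index and a 16-wide chunk of the reduced dimension. A hedged
; sketch of one item:
;   int row = i % num_preserved, col = (i / num_preserved) * 16;
;   float acc = reducer.initialize();
;   for (int j = col; j < min(col + 16, num_coeffs_to_reduce); ++j)
;     reducer.reduce(input.coeff(j * num_preserved + row), &acc);
;   atomicReduce(&output[row], acc, reducer);
; -----------------------------------------------------------------------------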
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca float*, align 8
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
%18 = alloca i32, align 4
%19 = alloca i32, align 4
%20 = alloca i32, align 4
%21 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
store float* %4, float** %8, align 8
%22 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %22) #9
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%24 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%25 = mul i32 %23, %24
store i32 %25, i32* %9, align 4
%26 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %26) #9
%27 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = mul i32 %27, %28
%30 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%31 = add i32 %29, %30
store i32 %31, i32* %10, align 4
%32 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%33 = icmp eq i32 %32, 1
br i1 %33, label %34, label %54
; <label>:34: ; preds = %5
%35 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %35) #9
%36 = load i32, i32* %10, align 4
store i32 %36, i32* %11, align 4
br label %37
; <label>:37: ; preds = %49, %34
%38 = load i32, i32* %11, align 4
%39 = load i32, i32* %7, align 4
%40 = icmp slt i32 %38, %39
br i1 %40, label %43, label %41
; <label>:41: ; preds = %37
%42 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %42) #9
br label %53
; <label>:43: ; preds = %37
%44 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
%45 = load i32, i32* %11, align 4
%46 = sext i32 %45 to i64
%47 = load float*, float** %8, align 8
%48 = getelementptr inbounds float, float* %47, i64 %46
store float %44, float* %48, align 4
br label %49
; <label>:49: ; preds = %43
%50 = load i32, i32* %9, align 4
%51 = load i32, i32* %11, align 4
%52 = add nsw i32 %51, %50
store i32 %52, i32* %11, align 4
br label %37
; <label>:53: ; preds = %41
call void @llvm.cuda.syncthreads()
br label %54
; <label>:54: ; preds = %53, %5
%55 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = load i32, i32* %7, align 4
%57 = load i32, i32* %6, align 4
%58 = call i32 @_ZN5Eigen5divupIiEET_S1_S1_(i32 %57, i32 16) #10
%59 = mul nsw i32 %56, %58
store i32 %59, i32* %12, align 4
%60 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %60) #9
%61 = load i32, i32* %10, align 4
store i32 %61, i32* %13, align 4
br label %62
; <label>:62: ; preds = %116, %54
%63 = load i32, i32* %13, align 4
%64 = load i32, i32* %12, align 4
%65 = icmp slt i32 %63, %64
br i1 %65, label %68, label %66
; <label>:66: ; preds = %62
store i32 5, i32* %14, align 4
%67 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %67) #9
br label %120
; <label>:68: ; preds = %62
%69 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %69) #9
%70 = load i32, i32* %13, align 4
%71 = load i32, i32* %7, align 4
%72 = srem i32 %70, %71
store i32 %72, i32* %15, align 4
%73 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %73) #9
%74 = load i32, i32* %13, align 4
%75 = load i32, i32* %7, align 4
%76 = sdiv i32 %74, %75
%77 = mul nsw i32 %76, 16
store i32 %77, i32* %16, align 4
%78 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %78) #9
%79 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %0) #10
store float %79, float* %17, align 4
%80 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %80) #9
%81 = load i32, i32* %16, align 4
%82 = add nsw i32 %81, 16
store i32 %82, i32* %19, align 4
%83 = call i32 @_ZN5Eigen6numext4miniIiEET_RKS2_S4_(i32* dereferenceable(4) %19, i32* dereferenceable(4) %6) #10
store i32 %83, i32* %18, align 4
%84 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %84) #9
%85 = load i32, i32* %16, align 4
store i32 %85, i32* %20, align 4
br label %86
; <label>:86: ; preds = %103, %68
%87 = load i32, i32* %20, align 4
%88 = load i32, i32* %18, align 4
%89 = icmp slt i32 %87, %88
br i1 %89, label %92, label %90
; <label>:90: ; preds = %86
store i32 8, i32* %14, align 4
%91 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %91) #9
br label %106
; <label>:92: ; preds = %86
%93 = bitcast float* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %93) #9
%94 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i32 0, i32 10
%95 = load i32, i32* %20, align 4
%96 = load i32, i32* %7, align 4
%97 = mul nsw i32 %95, %96
%98 = load i32, i32* %15, align 4
%99 = add nsw i32 %97, %98
%100 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %94, i32 %99) #10
store float %100, float* %21, align 4
%101 = load float, float* %21, align 4
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %0, float %101, float* %17) #10
%102 = bitcast float* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %102) #9
br label %103
; <label>:103: ; preds = %92
%104 = load i32, i32* %20, align 4
%105 = add nsw i32 %104, 1
store i32 %105, i32* %20, align 4
br label %86
; <label>:106: ; preds = %90
%107 = load i32, i32* %15, align 4
%108 = sext i32 %107 to i64
%109 = load float*, float** %8, align 8
%110 = getelementptr inbounds float, float* %109, i64 %108
%111 = load float, float* %17, align 4
call void @_ZN5Eigen8internal12atomicReduceIfEEvPT_S2_RNS0_10SumReducerIS2_EE(float* %110, float %111, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %0) #10
%112 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %112) #9
%113 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %113) #9
%114 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %114) #9
%115 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %115) #9
br label %116
; <label>:116: ; preds = %106
%117 = load i32, i32* %9, align 4
%118 = load i32, i32* %13, align 4
%119 = add nsw i32 %118, %117
store i32 %119, i32* %13, align 4
br label %62
; <label>:120: ; preds = %66
%121 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %121) #9
%122 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %122) #9
%123 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %123) #9
ret void
}
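; Editor's note: EigenMetaKernel for the EvalTo(sum-reduction) expression. It
; computes first_index = blockIdx.x * blockDim.x + threadIdx.x and
; step_size = blockDim.x * gridDim.x, makes two local 128-byte copies of the
; byval evaluator (clang's lowering of the pass-by-value device argument),
; dispatches to the Vectorizable=false EigenMetaKernelEval::run, and then runs
; the evaluator destructor on both copies.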
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.14"* byval align 8, i32) #0 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca %"struct.Eigen::TensorEvaluator.14", align 8
%7 = alloca i8, align 1
%8 = alloca %"struct.Eigen::TensorEvaluator.14", align 8
store i32 %1, i32* %3, align 4
%9 = bitcast i32* %4 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%12 = mul i32 %10, %11
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%14 = add i32 %12, %13
store i32 %14, i32* %4, align 4
%15 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %15) #9
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%18 = mul i32 %16, %17
store i32 %18, i32* %5, align 4
%19 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8*
call void @llvm.lifetime.start(i64 128, i8* %19) #9
%20 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8*
%21 = bitcast %"struct.Eigen::TensorEvaluator.14"* %0 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 128, i32 8, i1 false)
call void @llvm.lifetime.start(i64 1, i8* %7) #9
store i8 0, i8* %7, align 1
%22 = bitcast %"struct.Eigen::TensorEvaluator.14"* %8 to i8*
%23 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 128, i32 8, i1 false)
%24 = load i32, i32* %4, align 4
%25 = load i32, i32* %3, align 4
%26 = load i32, i32* %5, align 4
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.14"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.14"* %8) #5
call void @llvm.lifetime.end(i64 1, i8* %7) #9
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.14"* %6) #5
%27 = bitcast %"struct.Eigen::TensorEvaluator.14"* %6 to i8*
call void @llvm.lifetime.end(i64 128, i8* %27) #9
%28 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %28) #9
%29 = bitcast i32* %4 to i8*
call void @llvm.lifetime.end(i64 4, i8* %29) #9
ret void
}
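; Editor's note: the Vectorizable=false EigenMetaKernelEval::run below is a
; plain grid-stride loop; a hedged CUDA-level equivalent:
;   for (int i = first_index; i < size; i += step_size) eval.evalScalar(i);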
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii(%"struct.Eigen::TensorEvaluator.14"* byval align 8, i32, i32, i32) #2 comdat align 2 {
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%9 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = load i32, i32* %5, align 4
store i32 %10, i32* %8, align 4
br label %11
; <label>:11: ; preds = %19, %4
%12 = load i32, i32* %8, align 4
%13 = load i32, i32* %6, align 4
%14 = icmp slt i32 %12, %13
br i1 %14, label %17, label %15
; <label>:15: ; preds = %11
%16 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
br label %23
; <label>:17: ; preds = %11
%18 = load i32, i32* %8, align 4
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.14"* %0, i32 %18) #10
br label %19
; <label>:19: ; preds = %17
%20 = load i32, i32* %7, align 4
%21 = load i32, i32* %8, align 4
%22 = add nsw i32 %21, %20
store i32 %22, i32* %8, align 4
br label %11
; <label>:23: ; preds = %15
ret void
}
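; Editor's note: the D1 (complete-object) destructor below only forwards to
; the D2 (base-object) destructor defined further down.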
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED1Ev(%"struct.Eigen::TensorEvaluator.14"*) unnamed_addr #4 comdat align 2 {
%2 = alloca %"struct.Eigen::TensorEvaluator.14"*, align 8
store %"struct.Eigen::TensorEvaluator.14"* %0, %"struct.Eigen::TensorEvaluator.14"** %2, align 8
%3 = load %"struct.Eigen::TensorEvaluator.14"*, %"struct.Eigen::TensorEvaluator.14"** %2, align 8
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.14"* %3) #5
ret void
}
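; Editor's note: evalScalar(i) below computes one reduced coefficient through
; the nested reduction evaluator's coeff(i) (field 0) and stores it to
; m_buffer[i], the float* at field 2 of the EvalTo evaluator.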
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.14"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.14"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.14"* %0, %"struct.Eigen::TensorEvaluator.14"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.14"*, %"struct.Eigen::TensorEvaluator.14"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %5, i32 0, i32 0
%7 = load i32, i32* %4, align 4
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.12"* %6, i32 %7) #10
%9 = load i32, i32* %4, align 4
%10 = sext i32 %9 to i64
%11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %5, i32 0, i32 2
%12 = load float*, float** %11, align 8
%13 = getelementptr inbounds float, float* %12, i64 %10
store float %8, float* %13, align 4
ret void
}
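; Editor's note: this coeff(int) is the generic, non-kernel reduction path: it
; constructs a fresh SumReducer, seeds accum = reducer.initialize(), remaps
; the output index with firstInput(), folds the reduced dimension through
; GenericDimReducer<0>::reduce, and returns reducer.finalize(accum).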
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.12"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8
%4 = alloca i32, align 4
%5 = alloca %"struct.Eigen::internal::SumReducer", align 1
%6 = alloca float, align 4
store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %3, align 8
store i32 %1, i32* %4, align 4
%7 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %3, align 8
%8 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8*
call void @llvm.lifetime.start(i64 1, i8* %8) #9
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %7, i32 0, i32 11
%10 = bitcast float* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %10) #9
%11 = call float @_ZNK5Eigen8internal10SumReducerIfE10initializeEv(%"struct.Eigen::internal::SumReducer"* %5) #10
store float %11, float* %6, align 4
%12 = load i32, i32* %4, align 4
%13 = call i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator.12"* %7, i32 %12) #10
call void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112) %7, i32 %13, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %5, float* %6) #10
%14 = load float, float* %6, align 4
%15 = call float @_ZNK5Eigen8internal10SumReducerIfE8finalizeEf(%"struct.Eigen::internal::SumReducer"* %5, float %14) #10
%16 = bitcast float* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
%17 = bitcast %"struct.Eigen::internal::SumReducer"* %5 to i8*
call void @llvm.lifetime.end(i64 1, i8* %17) #9
ret float %15
}
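; Editor's note: GenericDimReducer<0>::reduce below loops j over the reduced
; dimension's extent (field 9 of the evaluator, m_reducedDims[0] in the Eigen
; sources), advances the input index by the reduced stride (field 8) on each
; step, and recurses into the GenericDimReducer<-1> base case per element.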
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILi0ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 {
%5 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8
%6 = alloca i32, align 4
%7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
%8 = alloca float*, align 8
%9 = alloca i32, align 4
%10 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %5, align 8
store i32 %1, i32* %6, align 4
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8
store float* %3, float** %8, align 8
%11 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %11) #9
store i32 0, i32* %9, align 4
br label %12
; <label>:12: ; preds = %36, %4
%13 = load i32, i32* %9, align 4
%14 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8
%15 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %14, i32 0, i32 9
%16 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %15, i64 0) #10
%17 = load i32, i32* %16, align 4
%18 = icmp slt i32 %13, %17
br i1 %18, label %21, label %19
; <label>:19: ; preds = %12
%20 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %20) #9
br label %39
; <label>:21: ; preds = %12
%22 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %22) #9
%23 = load i32, i32* %6, align 4
%24 = load i32, i32* %9, align 4
%25 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8
%26 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %25, i32 0, i32 8
%27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %26, i64 0) #10
%28 = load i32, i32* %27, align 4
%29 = mul nsw i32 %24, %28
%30 = add nsw i32 %23, %29
store i32 %30, i32* %10, align 4
%31 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8
%32 = load i32, i32* %10, align 4
%33 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8
%34 = load float*, float** %8, align 8
call void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112) %31, i32 %32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1) %33, float* %34) #10
%35 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %35) #9
br label %36
; <label>:36: ; preds = %21
%37 = load i32, i32* %9, align 4
%38 = add nsw i32 %37, 1
store i32 %38, i32* %9, align 4
br label %12
; <label>:39: ; preds = %19
ret void
}
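; Editor's note: firstInput(int) maps an output index to the index of the
; first coefficient folded into it. The loop over the TensorIntDivisor fast
; divisors (field 4) is dead here: its counter starts at 0 and the guard is
; `> 0`, so for this 1-D-output case the result reduces to
; index * preserved_stride (the i32 at field 7).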
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr i32 @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE10firstInputEi(%"struct.Eigen::TensorEvaluator.12"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %3, align 8
store i32 %1, i32* %4, align 4
%8 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %3, align 8
%9 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
store i32 0, i32* %5, align 4
%10 = bitcast i32* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %10) #9
store i32 0, i32* %6, align 4
br label %11
; <label>:11: ; preds = %42, %2
%12 = load i32, i32* %6, align 4
%13 = icmp sgt i32 %12, 0
br i1 %13, label %16, label %14
; <label>:14: ; preds = %11
%15 = bitcast i32* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %15) #9
br label %45
; <label>:16: ; preds = %11
%17 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %17) #9
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 4
%19 = load i32, i32* %6, align 4
%20 = sext i32 %19 to i64
%21 = call dereferenceable(12) %"struct.Eigen::internal::TensorIntDivisor"* @_ZNK5Eigen5arrayINS_8internal16TensorIntDivisorIiLb0EEELm1EEixEm(%"class.Eigen::array.2"* %18, i64 %20) #10
%22 = call i32 @_ZN5Eigen8internaldvIiLb0EEET_RKS2_RKNS0_16TensorIntDivisorIS2_XT0_EEE(i32* dereferenceable(4) %4, %"struct.Eigen::internal::TensorIntDivisor"* dereferenceable(12) %21) #10
store i32 %22, i32* %7, align 4
%23 = load i32, i32* %7, align 4
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 5
%25 = load i32, i32* %6, align 4
%26 = sext i32 %25 to i64
%27 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %24, i64 %26) #10
%28 = load i32, i32* %27, align 4
%29 = mul nsw i32 %23, %28
%30 = load i32, i32* %5, align 4
%31 = add nsw i32 %30, %29
store i32 %31, i32* %5, align 4
%32 = load i32, i32* %7, align 4
%33 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 3
%34 = load i32, i32* %6, align 4
%35 = sext i32 %34 to i64
%36 = call dereferenceable(4) i32* @_ZNK5Eigen5arrayIiLm1EEixEm(%"class.Eigen::array.0"* %33, i64 %35) #10
%37 = load i32, i32* %36, align 4
%38 = mul nsw i32 %32, %37
%39 = load i32, i32* %4, align 4
%40 = sub nsw i32 %39, %38
store i32 %40, i32* %4, align 4
%41 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %41) #9
br label %42
; <label>:42: ; preds = %16
%43 = load i32, i32* %6, align 4
%44 = add nsw i32 %43, -1
store i32 %44, i32* %6, align 4
br label %11
; <label>:45: ; preds = %14
%46 = load i32, i32* %4, align 4
%47 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %8, i32 0, i32 7
%48 = load i32, i32* %47, align 8
%49 = mul nsw i32 %46, %48
%50 = load i32, i32* %5, align 4
%51 = add nsw i32 %50, %49
store i32 %51, i32* %5, align 4
%52 = load i32, i32* %5, align 4
%53 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %53) #9
ret i32 %52
}
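; Editor's note: GenericDimReducer<-1>::reduce is the recursion base case: it
; reads one coefficient through the wrapped input evaluator (field 10) and
; folds it into *accum with reducer.reduce().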
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal17GenericDimReducerILin1ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_E6reduceERKSH_iRS5_Pf(%"struct.Eigen::TensorEvaluator.12"* dereferenceable(112), i32, %"struct.Eigen::internal::SumReducer"* dereferenceable(1), float*) #4 comdat align 2 {
%5 = alloca %"struct.Eigen::TensorEvaluator.12"*, align 8
%6 = alloca i32, align 4
%7 = alloca %"struct.Eigen::internal::SumReducer"*, align 8
%8 = alloca float*, align 8
store %"struct.Eigen::TensorEvaluator.12"* %0, %"struct.Eigen::TensorEvaluator.12"** %5, align 8
store i32 %1, i32* %6, align 4
store %"struct.Eigen::internal::SumReducer"* %2, %"struct.Eigen::internal::SumReducer"** %7, align 8
store float* %3, float** %8, align 8
%9 = load %"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::internal::SumReducer"** %7, align 8
%10 = load %"struct.Eigen::TensorEvaluator.12"*, %"struct.Eigen::TensorEvaluator.12"** %5, align 8
%11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %10, i32 0, i32 10
%12 = load i32, i32* %6, align 4
%13 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %11, i32 %12) #10
%14 = load float*, float** %8, align 8
call void @_ZNK5Eigen8internal10SumReducerIfE6reduceEfPf(%"struct.Eigen::internal::SumReducer"* %9, float %13, float* %14) #10
ret void
}
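; Editor's note: the D2 (base-object) destructor below is effectively a no-op;
; its body only spills and reloads the this pointer.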
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEED2Ev(%"struct.Eigen::TensorEvaluator.14"*) unnamed_addr #4 comdat align 2 {
%2 = alloca %"struct.Eigen::TensorEvaluator.14"*, align 8
store %"struct.Eigen::TensorEvaluator.14"* %0, %"struct.Eigen::TensorEvaluator.14"** %2, align 8
%3 = load %"struct.Eigen::TensorEvaluator.14"*, %"struct.Eigen::TensorEvaluator.14"** %2, align 8
ret void
}
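; -----------------------------------------------------------------------------
; Editor's note: RowReduceKernel<GridSize=32, BlockSize=256, NumPerThread=128>
; for the anonymous-namespace CudaSumReducer, writing through
; PtrWrapper<float,int> (instantiation taken from the mangled name). The
; asserts pin blockDim to (256,1,1) and gridDim to (32,1,1). Each row is split
; into divup(num_cols, 256 * 128) column blocks, blocks stride the
; (row, column block) space by gridDim.x, and a thread's first column is
; col_block * 256 * 128 + threadIdx.x; the rest of the body accumulates its
; chunk and, mirroring the kernels above, presumably finishes with an atomic
; fold of the partial sum into the output row.
; -----------------------------------------------------------------------------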
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
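; Loop setup: %8 holds the constant 16 (plausibly an unroll factor), %9 computes
; ceil(%3 / 32768), i.e. the reduced dimension split into 256*128-element chunks,
; and %10 = %9 * %2 is the total number of chunks across all rows.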
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
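; Grid-stride outer loop: %13 starts at blockIdx.x and advances by 32 (the
; asserted gridDim.x) at label 237 until it reaches %10.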
; <label>:74: ; preds = %237, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %240
; <label>:80: ; preds = %74
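; Per-chunk decomposition: %15 = %13 mod %9 is the chunk index within the row,
; %16 = %13 / %9 the row; %17 = %15 * 256 * 128 + threadIdx.x is this thread's
; base input index, and the accumulator %19 starts at the reducer's bottom_value.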
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10
store float %99, float* %19, align 4
%100 = load i32, i32* %18, align 4
%101 = load i32, i32* %6, align 4
%102 = icmp slt i32 %100, %101
br i1 %102, label %103, label %198
; <label>:103: ; preds = %80
%104 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %104) #9
store i32 0, i32* %20, align 4
br label %105
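; Inner loop: each thread reduces 128 coefficients, handled 16 at a time via %20.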
; <label>:105: ; preds = %192, %103
%106 = load i32, i32* %20, align 4
%107 = icmp slt i32 %106, 128
br i1 %107, label %109, label %108
; <label>:108: ; preds = %105
store i32 5, i32* %14, align 4
br label %195
; <label>:109: ; preds = %105
%110 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %110) #9
%111 = load i32, i32* %17, align 4
%112 = load i32, i32* %20, align 4
%113 = add nsw i32 %112, 16
%114 = sub nsw i32 %113, 1
%115 = mul nsw i32 256, %114
%116 = add nsw i32 %111, %115
store i32 %116, i32* %21, align 4
%117 = load i32, i32* %21, align 4
%118 = load i32, i32* %7, align 4
%119 = icmp sge i32 %117, %118
br i1 %119, label %120, label %158
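; If the last index of the current 16-wide group (%21 = %17 + 256*(%20+15)) is
; >= the count in %7, take the bounds-checked path at label 120 (up to 15
; guarded reads); otherwise label 158 reduces all 16 coefficients with no
; per-element checks.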
; <label>:120: ; preds = %109
%121 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %121) #9
store i32 0, i32* %22, align 4
br label %122
; <label>:122: ; preds = %152, %120
%123 = load i32, i32* %22, align 4
%124 = icmp slt i32 %123, 15
br i1 %124, label %126, label %125
; <label>:125: ; preds = %122
store i32 8, i32* %14, align 4
br label %155
; <label>:126: ; preds = %122
%127 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %127) #9
%128 = load i32, i32* %17, align 4
%129 = load i32, i32* %20, align 4
%130 = load i32, i32* %22, align 4
%131 = add nsw i32 %129, %130
%132 = mul nsw i32 256, %131
%133 = add nsw i32 %128, %132
store i32 %133, i32* %23, align 4
%134 = load i32, i32* %23, align 4
%135 = load i32, i32* %7, align 4
%136 = icmp sge i32 %134, %135
br i1 %136, label %137, label %138
; <label>:137: ; preds = %126
store i32 8, i32* %14, align 4
br label %148
; <label>:138: ; preds = %126
%139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%140 = load float, float* %19, align 4
%141 = load i32, i32* %18, align 4
%142 = load i32, i32* %7, align 4
%143 = mul nsw i32 %141, %142
%144 = load i32, i32* %23, align 4
%145 = add nsw i32 %143, %144
%146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %145) #10
%147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10
store float %147, float* %19, align 4
store i32 0, i32* %14, align 4
br label %148
; <label>:148: ; preds = %138, %137
%149 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %149) #9
%150 = load i32, i32* %14, align 4
switch i32 %150, label %155 [
i32 0, label %151
]
; <label>:151: ; preds = %148
br label %152
; <label>:152: ; preds = %151
%153 = load i32, i32* %22, align 4
%154 = add nsw i32 %153, 1
store i32 %154, i32* %22, align 4
br label %122, !llvm.loop !72
; <label>:155: ; preds = %148, %125
%156 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %156) #9
br label %157
; <label>:157: ; preds = %155
store i32 5, i32* %14, align 4
br label %188
; <label>:158: ; preds = %109
%159 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %159) #9
store i32 0, i32* %24, align 4
br label %160
; <label>:160: ; preds = %183, %158
%161 = load i32, i32* %24, align 4
%162 = icmp slt i32 %161, 16
br i1 %162, label %165, label %163
; <label>:163: ; preds = %160
store i32 11, i32* %14, align 4
%164 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %164) #9
br label %186
; <label>:165: ; preds = %160
%166 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %166) #9
%167 = load i32, i32* %17, align 4
%168 = load i32, i32* %20, align 4
%169 = load i32, i32* %24, align 4
%170 = add nsw i32 %168, %169
%171 = mul nsw i32 256, %170
%172 = add nsw i32 %167, %171
store i32 %172, i32* %25, align 4
%173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%174 = load float, float* %19, align 4
%175 = load i32, i32* %18, align 4
%176 = load i32, i32* %7, align 4
%177 = mul nsw i32 %175, %176
%178 = load i32, i32* %25, align 4
%179 = add nsw i32 %177, %178
%180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %179) #10
%181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10
store float %181, float* %19, align 4
%182 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %182) #9
br label %183
; <label>:183: ; preds = %165
%184 = load i32, i32* %24, align 4
%185 = add nsw i32 %184, 1
store i32 %185, i32* %24, align 4
br label %160, !llvm.loop !73
; <label>:186: ; preds = %163
br label %187
; <label>:187: ; preds = %186
store i32 0, i32* %14, align 4
br label %188
; <label>:188: ; preds = %187, %157
%189 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %189) #9
%190 = load i32, i32* %14, align 4
switch i32 %190, label %195 [
i32 0, label %191
]
; <label>:191: ; preds = %188
br label %192
; <label>:192: ; preds = %191
%193 = load i32, i32* %20, align 4
%194 = add nsw i32 %193, 16
store i32 %194, i32* %20, align 4
br label %105, !llvm.loop !74
; <label>:195: ; preds = %188, %108
%196 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %196) #9
br label %197
; <label>:197: ; preds = %195
br label %198
; <label>:198: ; preds = %197, %80
%199 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %199) #9
store i32 16, i32* %26, align 4
br label %200
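; Warp-level tree reduction: fold the accumulator with __shfl_down at offsets
; 16, 8, 4, 2, 1 across the 32-lane warp.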
; <label>:200: ; preds = %212, %198
%201 = load i32, i32* %26, align 4
%202 = icmp sgt i32 %201, 0
br i1 %202, label %205, label %203
; <label>:203: ; preds = %200
store i32 14, i32* %14, align 4
%204 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %204) #9
br label %215
; <label>:205: ; preds = %200
%206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%207 = load float, float* %19, align 4
%208 = load i32, i32* %26, align 4
%209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10
%210 = load float, float* %19, align 4
%211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10
store float %211, float* %19, align 4
br label %212
; <label>:212: ; preds = %205
%213 = load i32, i32* %26, align 4
%214 = sdiv i32 %213, 2
store i32 %214, i32* %26, align 4
br label %200, !llvm.loop !75
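; Lane 0 of each warp (threadIdx.x & 31 == 0) merges its partial result into
; output[%18] through PtrWrapper::coeffRef and the reducer's atomic_reduce,
; guarded by %18 < %6.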
; <label>:215: ; preds = %203
%216 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %216) #9
%217 = load i32, i32* %12, align 4
%218 = and i32 %217, 31
store i32 %218, i32* %27, align 4
%219 = load i32, i32* %27, align 4
%220 = icmp eq i32 %219, 0
br i1 %220, label %221, label %230
; <label>:221: ; preds = %215
%222 = load i32, i32* %18, align 4
%223 = load i32, i32* %6, align 4
%224 = icmp slt i32 %222, %223
br i1 %224, label %225, label %230
; <label>:225: ; preds = %221
%226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%227 = load i32, i32* %18, align 4
%228 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %227) #10
%229 = load float, float* %19, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10
br label %230
; <label>:230: ; preds = %225, %221, %215
%231 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
%232 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %232) #9
%233 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %233) #9
%234 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %234) #9
%235 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %235) #9
%236 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
br label %237
; <label>:237: ; preds = %230
%238 = load i32, i32* %13, align 4
%239 = add nsw i32 %238, 32
store i32 %239, i32* %13, align 4
br label %74
; <label>:240: ; preds = %78
%241 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %241) #9
%242 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %242) #9
%243 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %243) #9
%244 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %244) #9
%245 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %245) #9
ret void
}
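; The next kernel is the same RowReduceKernel<32, 256, 128, ...> specialisation
; instantiated for CudaMaxReducer: the control flow is identical, with
; bottom_value, operator() and atomic_reduce resolved to the max reducer's
; methods (and no FnSumReducer bitcasts, since CudaMaxReducer carries a float).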
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; <label>:74: ; preds = %232, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %235
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %98, float* %19, align 4
%99 = load i32, i32* %18, align 4
%100 = load i32, i32* %6, align 4
%101 = icmp slt i32 %99, %100
br i1 %101, label %102, label %195
; <label>:102: ; preds = %80
%103 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %103) #9
store i32 0, i32* %20, align 4
br label %104
; <label>:104: ; preds = %189, %102
%105 = load i32, i32* %20, align 4
%106 = icmp slt i32 %105, 128
br i1 %106, label %108, label %107
; <label>:107: ; preds = %104
store i32 5, i32* %14, align 4
br label %192
; <label>:108: ; preds = %104
%109 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %109) #9
%110 = load i32, i32* %17, align 4
%111 = load i32, i32* %20, align 4
%112 = add nsw i32 %111, 16
%113 = sub nsw i32 %112, 1
%114 = mul nsw i32 256, %113
%115 = add nsw i32 %110, %114
store i32 %115, i32* %21, align 4
%116 = load i32, i32* %21, align 4
%117 = load i32, i32* %7, align 4
%118 = icmp sge i32 %116, %117
br i1 %118, label %119, label %156
; <label>:119: ; preds = %108
%120 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %120) #9
store i32 0, i32* %22, align 4
br label %121
; <label>:121: ; preds = %150, %119
%122 = load i32, i32* %22, align 4
%123 = icmp slt i32 %122, 15
br i1 %123, label %125, label %124
; <label>:124: ; preds = %121
store i32 8, i32* %14, align 4
br label %153
; <label>:125: ; preds = %121
%126 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %126) #9
%127 = load i32, i32* %17, align 4
%128 = load i32, i32* %20, align 4
%129 = load i32, i32* %22, align 4
%130 = add nsw i32 %128, %129
%131 = mul nsw i32 256, %130
%132 = add nsw i32 %127, %131
store i32 %132, i32* %23, align 4
%133 = load i32, i32* %23, align 4
%134 = load i32, i32* %7, align 4
%135 = icmp sge i32 %133, %134
br i1 %135, label %136, label %137
; <label>:136: ; preds = %125
store i32 8, i32* %14, align 4
br label %146
; <label>:137: ; preds = %125
%138 = load float, float* %19, align 4
%139 = load i32, i32* %18, align 4
%140 = load i32, i32* %7, align 4
%141 = mul nsw i32 %139, %140
%142 = load i32, i32* %23, align 4
%143 = add nsw i32 %141, %142
%144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %143) #10
%145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10
store float %145, float* %19, align 4
store i32 0, i32* %14, align 4
br label %146
; <label>:146: ; preds = %137, %136
%147 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %147) #9
%148 = load i32, i32* %14, align 4
switch i32 %148, label %153 [
i32 0, label %149
]
; <label>:149: ; preds = %146
br label %150
; <label>:150: ; preds = %149
%151 = load i32, i32* %22, align 4
%152 = add nsw i32 %151, 1
store i32 %152, i32* %22, align 4
br label %121, !llvm.loop !76
; <label>:153: ; preds = %146, %124
%154 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %154) #9
br label %155
; <label>:155: ; preds = %153
store i32 5, i32* %14, align 4
br label %185
; <label>:156: ; preds = %108
%157 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %157) #9
store i32 0, i32* %24, align 4
br label %158
; <label>:158: ; preds = %180, %156
%159 = load i32, i32* %24, align 4
%160 = icmp slt i32 %159, 16
br i1 %160, label %163, label %161
; <label>:161: ; preds = %158
store i32 11, i32* %14, align 4
%162 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %162) #9
br label %183
; <label>:163: ; preds = %158
%164 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %164) #9
%165 = load i32, i32* %17, align 4
%166 = load i32, i32* %20, align 4
%167 = load i32, i32* %24, align 4
%168 = add nsw i32 %166, %167
%169 = mul nsw i32 256, %168
%170 = add nsw i32 %165, %169
store i32 %170, i32* %25, align 4
%171 = load float, float* %19, align 4
%172 = load i32, i32* %18, align 4
%173 = load i32, i32* %7, align 4
%174 = mul nsw i32 %172, %173
%175 = load i32, i32* %25, align 4
%176 = add nsw i32 %174, %175
%177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %176) #10
%178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10
store float %178, float* %19, align 4
%179 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %179) #9
br label %180
; <label>:180: ; preds = %163
%181 = load i32, i32* %24, align 4
%182 = add nsw i32 %181, 1
store i32 %182, i32* %24, align 4
br label %158, !llvm.loop !77
; <label>:183: ; preds = %161
br label %184
; <label>:184: ; preds = %183
store i32 0, i32* %14, align 4
br label %185
; <label>:185: ; preds = %184, %155
%186 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %186) #9
%187 = load i32, i32* %14, align 4
switch i32 %187, label %192 [
i32 0, label %188
]
; <label>:188: ; preds = %185
br label %189
; <label>:189: ; preds = %188
%190 = load i32, i32* %20, align 4
%191 = add nsw i32 %190, 16
store i32 %191, i32* %20, align 4
br label %104, !llvm.loop !78
; <label>:192: ; preds = %185, %107
%193 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %193) #9
br label %194
; <label>:194: ; preds = %192
br label %195
; <label>:195: ; preds = %194, %80
%196 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %196) #9
store i32 16, i32* %26, align 4
br label %197
; <label>:197: ; preds = %208, %195
%198 = load i32, i32* %26, align 4
%199 = icmp sgt i32 %198, 0
br i1 %199, label %202, label %200
; <label>:200: ; preds = %197
store i32 14, i32* %14, align 4
%201 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %201) #9
br label %211
; <label>:202: ; preds = %197
%203 = load float, float* %19, align 4
%204 = load i32, i32* %26, align 4
%205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10
%206 = load float, float* %19, align 4
%207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10
store float %207, float* %19, align 4
br label %208
; <label>:208: ; preds = %202
%209 = load i32, i32* %26, align 4
%210 = sdiv i32 %209, 2
store i32 %210, i32* %26, align 4
br label %197, !llvm.loop !79
; <label>:211: ; preds = %200
%212 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %212) #9
%213 = load i32, i32* %12, align 4
%214 = and i32 %213, 31
store i32 %214, i32* %27, align 4
%215 = load i32, i32* %27, align 4
%216 = icmp eq i32 %215, 0
br i1 %216, label %217, label %225
; <label>:217: ; preds = %211
%218 = load i32, i32* %18, align 4
%219 = load i32, i32* %6, align 4
%220 = icmp slt i32 %218, %219
br i1 %220, label %221, label %225
; <label>:221: ; preds = %217
%222 = load i32, i32* %18, align 4
%223 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %222) #10
%224 = load float, float* %19, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10
br label %225
; <label>:225: ; preds = %221, %217, %211
%226 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %226) #9
%227 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %227) #9
%228 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %228) #9
%229 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %229) #9
%230 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %230) #9
%231 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
br label %232
; <label>:232: ; preds = %225
%233 = load i32, i32* %13, align 4
%234 = add nsw i32 %233, 32
store i32 %234, i32* %13, align 4
br label %74
; <label>:235: ; preds = %78
%236 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
%237 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %237) #9
%238 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %238) #9
%239 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %239) #9
%240 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %240) #9
ret void
}
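; ColumnReduceKernel<128, 256, 16, ..., CudaSumReducer>: each thread reduces a
; 16-row strip of one column and publishes its partial result with a single
; atomic update per strip.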
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
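; Grid-stride loop: %11 = blockIdx.x * 256 + threadIdx.x steps by 32768
; (gridDim 128 * blockDim 256) over %8 = ceil(%2 / 16) * %3 work items;
; %13 = item mod %3 is the column and %14 = ((item / %3) mod ceil(%2 / 16)) * 16
; the first row of the strip. Out-of-range reads are replaced by bottom_value.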
; <label>:64: ; preds = %135, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %138
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10
store float %87, float* %15, align 4
%88 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %88) #9
store i32 0, i32* %16, align 4
br label %89
; <label>:89: ; preds = %124, %70
%90 = load i32, i32* %16, align 4
%91 = icmp slt i32 %90, 16
br i1 %91, label %94, label %92
; <label>:92: ; preds = %89
store i32 5, i32* %12, align 4
%93 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %93) #9
br label %127
; <label>:94: ; preds = %89
%95 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %13, align 4
%97 = load i32, i32* %7, align 4
%98 = icmp slt i32 %96, %97
br i1 %98, label %99, label %114
; <label>:99: ; preds = %94
%100 = load i32, i32* %14, align 4
%101 = load i32, i32* %16, align 4
%102 = add nsw i32 %100, %101
%103 = load i32, i32* %6, align 4
%104 = icmp slt i32 %102, %103
br i1 %104, label %105, label %114
; <label>:105: ; preds = %99
%106 = load i32, i32* %14, align 4
%107 = load i32, i32* %16, align 4
%108 = add nsw i32 %106, %107
%109 = load i32, i32* %7, align 4
%110 = mul nsw i32 %108, %109
%111 = load i32, i32* %13, align 4
%112 = add nsw i32 %110, %111
%113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %112) #10
br label %117
; <label>:114: ; preds = %99, %94
%115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10
br label %117
; <label>:117: ; preds = %114, %105
%118 = phi float [ %113, %105 ], [ %116, %114 ]
store float %118, float* %17, align 4
%119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%120 = load float, float* %15, align 4
%121 = load float, float* %17, align 4
%122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10
store float %122, float* %15, align 4
%123 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %123) #9
br label %124
; <label>:124: ; preds = %117
%125 = load i32, i32* %16, align 4
%126 = add nsw i32 %125, 1
store i32 %126, i32* %16, align 4
br label %89
; <label>:127: ; preds = %92
%128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%129 = load i32, i32* %13, align 4
%130 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %129) #10
%131 = load float, float* %15, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10
%132 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %132) #9
%133 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %133) #9
%134 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %134) #9
br label %135
; <label>:135: ; preds = %127
%136 = load i32, i32* %11, align 4
%137 = add nsw i32 %136, 32768
store i32 %137, i32* %11, align 4
br label %64
; <label>:138: ; preds = %68
%139 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %139) #9
%140 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %140) #9
%141 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %141) #9
ret void
}
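; Same ColumnReduceKernel specialisation for CudaMaxReducer; only the reducer
; callbacks differ.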
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; <label>:64: ; preds = %131, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %134
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %86, float* %15, align 4
%87 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %87) #9
store i32 0, i32* %16, align 4
br label %88
; <label>:88: ; preds = %121, %70
%89 = load i32, i32* %16, align 4
%90 = icmp slt i32 %89, 16
br i1 %90, label %93, label %91
; <label>:91: ; preds = %88
store i32 5, i32* %12, align 4
%92 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %92) #9
br label %124
; <label>:93: ; preds = %88
%94 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %94) #9
%95 = load i32, i32* %13, align 4
%96 = load i32, i32* %7, align 4
%97 = icmp slt i32 %95, %96
br i1 %97, label %98, label %113
; <label>:98: ; preds = %93
%99 = load i32, i32* %14, align 4
%100 = load i32, i32* %16, align 4
%101 = add nsw i32 %99, %100
%102 = load i32, i32* %6, align 4
%103 = icmp slt i32 %101, %102
br i1 %103, label %104, label %113
; <label>:104: ; preds = %98
%105 = load i32, i32* %14, align 4
%106 = load i32, i32* %16, align 4
%107 = add nsw i32 %105, %106
%108 = load i32, i32* %7, align 4
%109 = mul nsw i32 %107, %108
%110 = load i32, i32* %13, align 4
%111 = add nsw i32 %109, %110
%112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %111) #10
br label %115
; <label>:113: ; preds = %98, %93
%114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
br label %115
; <label>:115: ; preds = %113, %104
%116 = phi float [ %112, %104 ], [ %114, %113 ]
store float %116, float* %17, align 4
%117 = load float, float* %15, align 4
%118 = load float, float* %17, align 4
%119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10
store float %119, float* %15, align 4
%120 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %120) #9
br label %121
; <label>:121: ; preds = %115
%122 = load i32, i32* %16, align 4
%123 = add nsw i32 %122, 1
store i32 %123, i32* %16, align 4
br label %88
; <label>:124: ; preds = %91
%125 = load i32, i32* %13, align 4
%126 = call dereferenceable(4) float* @_ZN5Eigen8internal10PtrWrapperIfiE8coeffRefEi(%"struct.Eigen::internal::PtrWrapper"* %4, i32 %125) #10
%127 = load float, float* %15, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10
%128 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %128) #9
%129 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %129) #9
%130 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %130) #9
br label %131
; <label>:131: ; preds = %124
%132 = load i32, i32* %11, align 4
%133 = add nsw i32 %132, 32768
store i32 %133, i32* %11, align 4
br label %64
; <label>:134: ; preds = %68
%135 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %135) #9
%136 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %136) #9
%137 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %137) #9
ret void
}
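; Generic elementwise kernel: EigenMetaKernel assigns a forced-evaluated sum
; reduction into a rank-1 TensorMap. It derives a global thread index and the
; total thread count, copies the 168-byte byval evaluator (twice, as clang's
; byval lowering tends to do), and dispatches to EigenMetaKernelEval::run.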
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.15"* byval align 8, i32) #0 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca %"struct.Eigen::TensorEvaluator.15", align 8
%7 = alloca i8, align 1
%8 = alloca %"struct.Eigen::TensorEvaluator.15", align 8
store i32 %1, i32* %3, align 4
%9 = bitcast i32* %4 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%12 = mul i32 %10, %11
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%14 = add i32 %12, %13
store i32 %14, i32* %4, align 4
%15 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %15) #9
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%18 = mul i32 %16, %17
store i32 %18, i32* %5, align 4
%19 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8*
call void @llvm.lifetime.start(i64 168, i8* %19) #9
%20 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8*
%21 = bitcast %"struct.Eigen::TensorEvaluator.15"* %0 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 168, i32 8, i1 false)
call void @llvm.lifetime.start(i64 1, i8* %7) #9
store i8 0, i8* %7, align 1
%22 = bitcast %"struct.Eigen::TensorEvaluator.15"* %8 to i8*
%23 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 168, i32 8, i1 false)
%24 = load i32, i32* %4, align 4
%25 = load i32, i32* %3, align 4
%26 = load i32, i32* %5, align 4
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.15"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10
call void @llvm.lifetime.end(i64 1, i8* %7) #9
%27 = bitcast %"struct.Eigen::TensorEvaluator.15"* %6 to i8*
call void @llvm.lifetime.end(i64 168, i8* %27) #9
%28 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %28) #9
%29 = bitcast i32* %4 to i8*
call void @llvm.lifetime.end(i64 4, i8* %29) #9
ret void
}
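; EigenMetaKernelEval<..., Vectorizable = false>::run is a plain strided loop:
; for (i = firstIndex; i < size; i += step) evalScalar(i).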
; Function Attrs: alwaysinline convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii(%"struct.Eigen::TensorEvaluator.15"* byval align 8, i32, i32, i32) #2 comdat align 2 {
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%9 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = load i32, i32* %5, align 4
store i32 %10, i32* %8, align 4
br label %11
; <label>:11: ; preds = %19, %4
%12 = load i32, i32* %8, align 4
%13 = load i32, i32* %6, align 4
%14 = icmp slt i32 %12, %13
br i1 %14, label %17, label %15
; <label>:15: ; preds = %11
%16 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
br label %23
; <label>:17: ; preds = %11
%18 = load i32, i32* %8, align 4
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.15"* %0, i32 %18) #10
br label %19
; <label>:19: ; preds = %17
%20 = load i32, i32* %7, align 4
%21 = load i32, i32* %8, align 4
%22 = add nsw i32 %21, %20
store i32 %22, i32* %8, align 4
br label %11
; <label>:23: ; preds = %15
ret void
}
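; evalScalar(i): read coeff(i) from the forced-eval sub-evaluator (member 1)
; and store it through coeffRef(i) of the destination evaluator (member 0).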
; Function Attrs: convergent inlinehint nounwind
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.15"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.15"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.15"* %0, %"struct.Eigen::TensorEvaluator.15"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.15"*, %"struct.Eigen::TensorEvaluator.15"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %5, i32 0, i32 1
%7 = load i32, i32* %4, align 4
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.17"* %6, i32 %7) #10
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %5, i32 0, i32 0
%10 = load i32, i32* %4, align 4
%11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %9, i32 %10) #10
store float %8, float* %11, align 4
ret void
}
; Function Attrs: convergent inlinehint nounwind
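; coeff(i) of the TensorForcedEvalOp evaluator: a plain indexed load through the
; float* held in struct field 3, presumably the scratch buffer that the forced
; evaluation materialized the reduction into.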
define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.17"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.17"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.17"* %0, %"struct.Eigen::TensorEvaluator.17"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.17"*, %"struct.Eigen::TensorEvaluator.17"** %3, align 8
%6 = load i32, i32* %4, align 4
%7 = sext i32 %6 to i64
%8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.17", %"struct.Eigen::TensorEvaluator.17"* %5, i32 0, i32 3
%9 = load float*, float** %8, align 8
%10 = getelementptr inbounds float, float* %9, i64 %7
%11 = load float, float* %10, align 4
ret float %11
}
; Function Attrs: convergent inlinehint nounwind
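; coeffRef(i) of the 1-D TensorMap evaluator: returns &data[i], where data is the
; float* stored as struct field 0.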
define linkonce_odr dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.16"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.16"* %0, %"struct.Eigen::TensorEvaluator.16"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.16"*, %"struct.Eigen::TensorEvaluator.16"** %3, align 8
%6 = load i32, i32* %4, align 4
%7 = sext i32 %6 to i64
%8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %5, i32 0, i32 0
%9 = load float*, float** %8, align 8
%10 = getelementptr inbounds float, float* %9, i64 %7
ret float* %10
}
; Function Attrs: convergent nounwind
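; __global__ kernel entry; demangled:
;   Eigen::internal::EigenMetaKernel<TensorEvaluator<const TensorAssignOp<1-D float
;     TensorMap, const TensorReductionOp<SumReducer<float>, array<int,1>, 2-D float
;     TensorMap>>, GpuDevice>, int>(evaluator, size).
; It computes firstIdx = blockIdx.x * blockDim.x + threadIdx.x and
; stride = blockDim.x * gridDim.x, copies the 136-byte byval evaluator, and
; dispatches to EigenMetaKernelEval<...>::run(eval, firstIdx, size, stride).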
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.24"* byval align 8, i32) #0 comdat {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca %"struct.Eigen::TensorEvaluator.24", align 8
%7 = alloca i8, align 1
%8 = alloca %"struct.Eigen::TensorEvaluator.24", align 8
store i32 %1, i32* %3, align 4
%9 = bitcast i32* %4 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%11 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%12 = mul i32 %10, %11
%13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%14 = add i32 %12, %13
store i32 %14, i32* %4, align 4
%15 = bitcast i32* %5 to i8*
call void @llvm.lifetime.start(i64 4, i8* %15) #9
%16 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%17 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%18 = mul i32 %16, %17
store i32 %18, i32* %5, align 4
%19 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8*
call void @llvm.lifetime.start(i64 136, i8* %19) #9
%20 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8*
%21 = bitcast %"struct.Eigen::TensorEvaluator.24"* %0 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %20, i8* %21, i64 136, i32 8, i1 false)
call void @llvm.lifetime.start(i64 1, i8* %7) #9
store i8 0, i8* %7, align 1
%22 = bitcast %"struct.Eigen::TensorEvaluator.24"* %8 to i8*
%23 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %22, i8* %23, i64 136, i32 8, i1 false)
%24 = load i32, i32* %4, align 4
%25 = load i32, i32* %3, align 4
%26 = load i32, i32* %5, align 4
call void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.24"* byval align 8 %8, i32 %24, i32 %25, i32 %26) #10
call void @llvm.lifetime.end(i64 1, i8* %7) #9
%27 = bitcast %"struct.Eigen::TensorEvaluator.24"* %6 to i8*
call void @llvm.lifetime.end(i64 136, i8* %27) #9
%28 = bitcast i32* %5 to i8*
call void @llvm.lifetime.end(i64 4, i8* %28) #9
%29 = bitcast i32* %4 to i8*
call void @llvm.lifetime.end(i64 4, i8* %29) #9
ret void
}
; Function Attrs: alwaysinline convergent inlinehint nounwind
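; Same grid-stride driver as EigenMetaKernelEval::run above, instantiated for the
; direct (non-forced-eval) sum-reduction assignment (TensorEvaluator.24):
;   for (int i = firstIdx; i < size; i += stride) eval.evalScalar(i);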
define linkonce_odr void @_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii(%"struct.Eigen::TensorEvaluator.24"* byval align 8, i32, i32, i32) #2 comdat align 2 {
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
store i32 %1, i32* %5, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%9 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %9) #9
%10 = load i32, i32* %5, align 4
store i32 %10, i32* %8, align 4
br label %11
; <label>:11: ; preds = %19, %4
%12 = load i32, i32* %8, align 4
%13 = load i32, i32* %6, align 4
%14 = icmp slt i32 %12, %13
br i1 %14, label %17, label %15
; <label>:15: ; preds = %11
%16 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %16) #9
br label %23
; <label>:17: ; preds = %11
%18 = load i32, i32* %8, align 4
call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.24"* %0, i32 %18) #10
br label %19
; <label>:19: ; preds = %17
%20 = load i32, i32* %7, align 4
%21 = load i32, i32* %8, align 4
%22 = add nsw i32 %21, %20
store i32 %22, i32* %8, align 4
br label %11
; <label>:23: ; preds = %15
ret void
}
; Function Attrs: convergent inlinehint nounwind
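; evalScalar(i) for the direct reduction: out.coeffRef(i) = reduction.coeff(i), where
; the reduction sub-evaluator (struct field 1, TensorEvaluator.12) appears to compute
; the sum over the reduced dimension on the fly.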
define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator.24"*, i32) #4 comdat align 2 {
%3 = alloca %"struct.Eigen::TensorEvaluator.24"*, align 8
%4 = alloca i32, align 4
store %"struct.Eigen::TensorEvaluator.24"* %0, %"struct.Eigen::TensorEvaluator.24"** %3, align 8
store i32 %1, i32* %4, align 4
%5 = load %"struct.Eigen::TensorEvaluator.24"*, %"struct.Eigen::TensorEvaluator.24"** %3, align 8
%6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %5, i32 0, i32 1
%7 = load i32, i32* %4, align 4
%8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.12"* %6, i32 %7) #10
%9 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %5, i32 0, i32 0
%10 = load i32, i32* %4, align 4
%11 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %9, i32 %10) #10
store float %8, float* %11, align 4
ret void
}
; Function Attrs: convergent nounwind
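; __global__ kernel; demangled:
;   Eigen::internal::(anonymous namespace)::InitVector<TensorEvaluator<TensorMap<
;     Tensor<float,1,0,int>>, GpuDevice>>(float value, int size, output).
; A grid-stride fill: output.coeffRef(i) = value for i starting at
; blockIdx.x * blockDim.x + threadIdx.x, stepping by blockDim.x * gridDim.x while
; i < size; presumably used to seed the output with the reducer's identity before
; the atomic row/column reduction kernels below run.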
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%4 = alloca float, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
%7 = alloca i32, align 4
store float %0, float* %4, align 4
store i32 %1, i32* %5, align 4
%8 = bitcast i32* %6 to i8*
call void @llvm.lifetime.start(i64 4, i8* %8) #9
%9 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
%10 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%11 = mul i32 %9, %10
%12 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
%13 = add i32 %11, %12
store i32 %13, i32* %6, align 4
%14 = bitcast i32* %7 to i8*
call void @llvm.lifetime.start(i64 4, i8* %14) #9
%15 = load i32, i32* %6, align 4
store i32 %15, i32* %7, align 4
br label %16
; <label>:16: ; preds = %26, %3
%17 = load i32, i32* %7, align 4
%18 = load i32, i32* %5, align 4
%19 = icmp slt i32 %17, %18
br i1 %19, label %22, label %20
; <label>:20: ; preds = %16
%21 = bitcast i32* %7 to i8*
call void @llvm.lifetime.end(i64 4, i8* %21) #9
br label %32
; <label>:22: ; preds = %16
%23 = load float, float* %4, align 4
%24 = load i32, i32* %7, align 4
%25 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %2, i32 %24) #10
store float %23, float* %25, align 4
br label %26
; <label>:26: ; preds = %22
%27 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = mul i32 %27, %28
%30 = load i32, i32* %7, align 4
%31 = add i32 %30, %29
store i32 %31, i32* %7, align 4
br label %16
; <label>:32: ; preds = %20
%33 = bitcast i32* %6 to i8*
call void @llvm.lifetime.end(i64 4, i8* %33) #9
ret void
}
; Function Attrs: convergent nounwind
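; __global__ row-reduction kernel; demangled:
;   Eigen::internal::(anonymous namespace)::RowReduceKernel<32, 256, 128,
;     input-evaluator, output-evaluator, CudaSumReducer>(reducer, input, num_rows,
;     num_cols, output)  (the template ints are apparently GridSize, BlockSize and
;     NumPerThread).
; The leading chain of compares calls __assert_fail unless the launch configuration
; matches the instantiation: blockDim == (256,1,1) and gridDim == (32,1,1).  Each
; block then walks (row, column-block) pairs in a grid-stride loop of step 32,
; accumulates up to 256 * 128 = 32768 column elements per block, folds the partial
; sums across the warp with __shfl_down, and lane 0 of each warp publishes its
; value with an atomic reduce into output[row].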
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
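;   Post-assert setup (interpretation): %8 holds the constant 16, apparently an
;   unroll factor; %9 = ceil(num_cols / 32768) column blocks per row, with
;   32768 = BlockSize * NumPerThread; %10 = %9 * num_rows total work items;
;   %13 is the outer induction variable, starting at blockIdx.x and stepping by
;   gridDim.x == 32.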
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; <label>:74: ; preds = %237, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %240
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%99 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %98) #10
store float %99, float* %19, align 4
%100 = load i32, i32* %18, align 4
%101 = load i32, i32* %6, align 4
%102 = icmp slt i32 %100, %101
br i1 %102, label %103, label %198
; <label>:103: ; preds = %80
%104 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %104) #9
store i32 0, i32* %20, align 4
br label %105
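;   Per-thread accumulation over NumPerThread = 128 column offsets, in chunks of 16.
;   Block %109 tests whether the chunk's last element, col + 256 * (j + 15), is past
;   num_cols: if so, the guarded loop at %120 probes only the first 15 offsets with
;   individual bounds checks; otherwise the unguarded loop at %158 reduces all 16.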
; <label>:105: ; preds = %192, %103
%106 = load i32, i32* %20, align 4
%107 = icmp slt i32 %106, 128
br i1 %107, label %109, label %108
; <label>:108: ; preds = %105
store i32 5, i32* %14, align 4
br label %195
; <label>:109: ; preds = %105
%110 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %110) #9
%111 = load i32, i32* %17, align 4
%112 = load i32, i32* %20, align 4
%113 = add nsw i32 %112, 16
%114 = sub nsw i32 %113, 1
%115 = mul nsw i32 256, %114
%116 = add nsw i32 %111, %115
store i32 %116, i32* %21, align 4
%117 = load i32, i32* %21, align 4
%118 = load i32, i32* %7, align 4
%119 = icmp sge i32 %117, %118
br i1 %119, label %120, label %158
; <label>:120: ; preds = %109
%121 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %121) #9
store i32 0, i32* %22, align 4
br label %122
; <label>:122: ; preds = %152, %120
%123 = load i32, i32* %22, align 4
%124 = icmp slt i32 %123, 15
br i1 %124, label %126, label %125
; <label>:125: ; preds = %122
store i32 8, i32* %14, align 4
br label %155
; <label>:126: ; preds = %122
%127 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %127) #9
%128 = load i32, i32* %17, align 4
%129 = load i32, i32* %20, align 4
%130 = load i32, i32* %22, align 4
%131 = add nsw i32 %129, %130
%132 = mul nsw i32 256, %131
%133 = add nsw i32 %128, %132
store i32 %133, i32* %23, align 4
%134 = load i32, i32* %23, align 4
%135 = load i32, i32* %7, align 4
%136 = icmp sge i32 %134, %135
br i1 %136, label %137, label %138
; <label>:137: ; preds = %126
store i32 8, i32* %14, align 4
br label %148
; <label>:138: ; preds = %126
%139 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%140 = load float, float* %19, align 4
%141 = load i32, i32* %18, align 4
%142 = load i32, i32* %7, align 4
%143 = mul nsw i32 %141, %142
%144 = load i32, i32* %23, align 4
%145 = add nsw i32 %143, %144
%146 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %145) #10
%147 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %139, float %140, float %146) #10
store float %147, float* %19, align 4
store i32 0, i32* %14, align 4
br label %148
; <label>:148: ; preds = %138, %137
%149 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %149) #9
%150 = load i32, i32* %14, align 4
switch i32 %150, label %155 [
i32 0, label %151
]
; <label>:151: ; preds = %148
br label %152
; <label>:152: ; preds = %151
%153 = load i32, i32* %22, align 4
%154 = add nsw i32 %153, 1
store i32 %154, i32* %22, align 4
br label %122, !llvm.loop !80
; <label>:155: ; preds = %148, %125
%156 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %156) #9
br label %157
; <label>:157: ; preds = %155
store i32 5, i32* %14, align 4
br label %188
; <label>:158: ; preds = %109
%159 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %159) #9
store i32 0, i32* %24, align 4
br label %160
; <label>:160: ; preds = %183, %158
%161 = load i32, i32* %24, align 4
%162 = icmp slt i32 %161, 16
br i1 %162, label %165, label %163
; <label>:163: ; preds = %160
store i32 11, i32* %14, align 4
%164 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %164) #9
br label %186
; <label>:165: ; preds = %160
%166 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %166) #9
%167 = load i32, i32* %17, align 4
%168 = load i32, i32* %20, align 4
%169 = load i32, i32* %24, align 4
%170 = add nsw i32 %168, %169
%171 = mul nsw i32 256, %170
%172 = add nsw i32 %167, %171
store i32 %172, i32* %25, align 4
%173 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%174 = load float, float* %19, align 4
%175 = load i32, i32* %18, align 4
%176 = load i32, i32* %7, align 4
%177 = mul nsw i32 %175, %176
%178 = load i32, i32* %25, align 4
%179 = add nsw i32 %177, %178
%180 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %179) #10
%181 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %173, float %174, float %180) #10
store float %181, float* %19, align 4
%182 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %182) #9
br label %183
; <label>:183: ; preds = %165
%184 = load i32, i32* %24, align 4
%185 = add nsw i32 %184, 1
store i32 %185, i32* %24, align 4
br label %160, !llvm.loop !81
; <label>:186: ; preds = %163
br label %187
; <label>:187: ; preds = %186
store i32 0, i32* %14, align 4
br label %188
; <label>:188: ; preds = %187, %157
%189 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %189) #9
%190 = load i32, i32* %14, align 4
switch i32 %190, label %195 [
i32 0, label %191
]
; <label>:191: ; preds = %188
br label %192
; <label>:192: ; preds = %191
%193 = load i32, i32* %20, align 4
%194 = add nsw i32 %193, 16
store i32 %194, i32* %20, align 4
br label %105, !llvm.loop !82
; <label>:195: ; preds = %188, %108
%196 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %196) #9
br label %197
; <label>:197: ; preds = %195
br label %198
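;   Warp-level tree reduction: val = reducer(__shfl_down(val, offset, 32), val) for
;   offset = 16, 8, 4, 2, 1; afterwards lane 0 (threadIdx.x & 31 == 0) of each warp
;   with a valid row atomically folds its value into output[row].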
; <label>:198: ; preds = %197, %80
%199 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %199) #9
store i32 16, i32* %26, align 4
br label %200
; <label>:200: ; preds = %212, %198
%201 = load i32, i32* %26, align 4
%202 = icmp sgt i32 %201, 0
br i1 %202, label %205, label %203
; <label>:203: ; preds = %200
store i32 14, i32* %14, align 4
%204 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %204) #9
br label %215
; <label>:205: ; preds = %200
%206 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%207 = load float, float* %19, align 4
%208 = load i32, i32* %26, align 4
%209 = call float @_ZL11__shfl_downfji(float %207, i32 %208, i32 32) #10
%210 = load float, float* %19, align 4
%211 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %206, float %209, float %210) #10
store float %211, float* %19, align 4
br label %212
; <label>:212: ; preds = %205
%213 = load i32, i32* %26, align 4
%214 = sdiv i32 %213, 2
store i32 %214, i32* %26, align 4
br label %200, !llvm.loop !83
; <label>:215: ; preds = %203
%216 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %216) #9
%217 = load i32, i32* %12, align 4
%218 = and i32 %217, 31
store i32 %218, i32* %27, align 4
%219 = load i32, i32* %27, align 4
%220 = icmp eq i32 %219, 0
br i1 %220, label %221, label %230
; <label>:221: ; preds = %215
%222 = load i32, i32* %18, align 4
%223 = load i32, i32* %6, align 4
%224 = icmp slt i32 %222, %223
br i1 %224, label %225, label %230
; <label>:225: ; preds = %221
%226 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%227 = load i32, i32* %18, align 4
%228 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %227) #10
%229 = load float, float* %19, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %226, float* %228, float %229) #10
br label %230
; <label>:230: ; preds = %225, %221, %215
%231 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
%232 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %232) #9
%233 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %233) #9
%234 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %234) #9
%235 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %235) #9
%236 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
br label %237
; <label>:237: ; preds = %230
%238 = load i32, i32* %13, align 4
%239 = add nsw i32 %238, 32
store i32 %239, i32* %13, align 4
br label %74
; <label>:240: ; preds = %78
%241 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %241) #9
%242 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %242) #9
%243 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %243) #9
%244 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %244) #9
%245 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %245) #9
ret void
}
; Function Attrs: convergent nounwind
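; Same RowReduceKernel<32, 256, 128, ...> shape as above, instantiated for
; CudaMaxReducer (whose single float member appears to hold the reducer's bottom
; value); the control flow is identical, with the max functor in place of the sum
; and CudaMaxReducer::atomic_reduce at the end.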
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32
%15 = alloca i32, align 4
%16 = alloca i32, align 4
%17 = alloca i32, align 4
%18 = alloca i32, align 4
%19 = alloca float, align 4
%20 = alloca i32, align 4
%21 = alloca i32, align 4
%22 = alloca i32, align 4
%23 = alloca i32, align 4
%24 = alloca i32, align 4
%25 = alloca i32, align 4
%26 = alloca i32, align 4
%27 = alloca i32, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%29 = icmp eq i32 %28, 256
br i1 %29, label %30, label %31
; <label>:30: ; preds = %5
br label %32
; <label>:31: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%34 = icmp eq i32 %33, 1
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%44 = icmp eq i32 %43, 32
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%49 = icmp eq i32 %48, 1
br i1 %49, label %50, label %51
; <label>:50: ; preds = %47
br label %52
; <label>:51: ; preds = %47
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %52
; <label>:52: ; preds = %51, %50
%53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%54 = icmp eq i32 %53, 1
br i1 %54, label %55, label %56
; <label>:55: ; preds = %52
br label %57
; <label>:56: ; preds = %52
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %57
; <label>:57: ; preds = %56, %55
%58 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %58) #9
store i32 16, i32* %8, align 4
%59 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %7, align 4
%61 = add nsw i32 %60, 32768
%62 = sub nsw i32 %61, 1
%63 = sdiv i32 %62, 32768
store i32 %63, i32* %9, align 4
%64 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %64) #9
%65 = load i32, i32* %9, align 4
%66 = load i32, i32* %6, align 4
%67 = mul nsw i32 %65, %66
store i32 %67, i32* %10, align 4
%68 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %68) #9
%69 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %69, i32* %11, align 4
%70 = bitcast i32* %12 to i8*
call void @llvm.lifetime.start(i64 4, i8* %70) #9
%71 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %71, i32* %12, align 4
%72 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %72) #9
%73 = load i32, i32* %11, align 4
store i32 %73, i32* %13, align 4
br label %74
; <label>:74: ; preds = %232, %57
%75 = load i32, i32* %13, align 4
%76 = load i32, i32* %10, align 4
%77 = icmp slt i32 %75, %76
br i1 %77, label %80, label %78
; <label>:78: ; preds = %74
store i32 2, i32* %14, align 4
%79 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %79) #9
br label %235
; <label>:80: ; preds = %74
%81 = bitcast i32* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %81) #9
%82 = load i32, i32* %13, align 4
%83 = load i32, i32* %9, align 4
%84 = srem i32 %82, %83
store i32 %84, i32* %15, align 4
%85 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = load i32, i32* %13, align 4
%87 = load i32, i32* %9, align 4
%88 = sdiv i32 %86, %87
store i32 %88, i32* %16, align 4
%89 = bitcast i32* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %89) #9
%90 = load i32, i32* %15, align 4
%91 = mul nsw i32 %90, 256
%92 = mul nsw i32 %91, 128
%93 = load i32, i32* %12, align 4
%94 = add nsw i32 %92, %93
store i32 %94, i32* %17, align 4
%95 = bitcast i32* %18 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %16, align 4
store i32 %96, i32* %18, align 4
%97 = bitcast float* %19 to i8*
call void @llvm.lifetime.start(i64 4, i8* %97) #9
%98 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %98, float* %19, align 4
%99 = load i32, i32* %18, align 4
%100 = load i32, i32* %6, align 4
%101 = icmp slt i32 %99, %100
br i1 %101, label %102, label %195
; <label>:102: ; preds = %80
%103 = bitcast i32* %20 to i8*
call void @llvm.lifetime.start(i64 4, i8* %103) #9
store i32 0, i32* %20, align 4
br label %104
; <label>:104: ; preds = %189, %102
%105 = load i32, i32* %20, align 4
%106 = icmp slt i32 %105, 128
br i1 %106, label %108, label %107
; <label>:107: ; preds = %104
store i32 5, i32* %14, align 4
br label %192
; <label>:108: ; preds = %104
%109 = bitcast i32* %21 to i8*
call void @llvm.lifetime.start(i64 4, i8* %109) #9
%110 = load i32, i32* %17, align 4
%111 = load i32, i32* %20, align 4
%112 = add nsw i32 %111, 16
%113 = sub nsw i32 %112, 1
%114 = mul nsw i32 256, %113
%115 = add nsw i32 %110, %114
store i32 %115, i32* %21, align 4
%116 = load i32, i32* %21, align 4
%117 = load i32, i32* %7, align 4
%118 = icmp sge i32 %116, %117
br i1 %118, label %119, label %156
; <label>:119: ; preds = %108
%120 = bitcast i32* %22 to i8*
call void @llvm.lifetime.start(i64 4, i8* %120) #9
store i32 0, i32* %22, align 4
br label %121
; <label>:121: ; preds = %150, %119
%122 = load i32, i32* %22, align 4
%123 = icmp slt i32 %122, 15
br i1 %123, label %125, label %124
; <label>:124: ; preds = %121
store i32 8, i32* %14, align 4
br label %153
; <label>:125: ; preds = %121
%126 = bitcast i32* %23 to i8*
call void @llvm.lifetime.start(i64 4, i8* %126) #9
%127 = load i32, i32* %17, align 4
%128 = load i32, i32* %20, align 4
%129 = load i32, i32* %22, align 4
%130 = add nsw i32 %128, %129
%131 = mul nsw i32 256, %130
%132 = add nsw i32 %127, %131
store i32 %132, i32* %23, align 4
%133 = load i32, i32* %23, align 4
%134 = load i32, i32* %7, align 4
%135 = icmp sge i32 %133, %134
br i1 %135, label %136, label %137
; <label>:136: ; preds = %125
store i32 8, i32* %14, align 4
br label %146
; <label>:137: ; preds = %125
%138 = load float, float* %19, align 4
%139 = load i32, i32* %18, align 4
%140 = load i32, i32* %7, align 4
%141 = mul nsw i32 %139, %140
%142 = load i32, i32* %23, align 4
%143 = add nsw i32 %141, %142
%144 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %143) #10
%145 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %138, float %144) #10
store float %145, float* %19, align 4
store i32 0, i32* %14, align 4
br label %146
; <label>:146: ; preds = %137, %136
%147 = bitcast i32* %23 to i8*
call void @llvm.lifetime.end(i64 4, i8* %147) #9
%148 = load i32, i32* %14, align 4
switch i32 %148, label %153 [
i32 0, label %149
]
; <label>:149: ; preds = %146
br label %150
; <label>:150: ; preds = %149
%151 = load i32, i32* %22, align 4
%152 = add nsw i32 %151, 1
store i32 %152, i32* %22, align 4
br label %121, !llvm.loop !84
; <label>:153: ; preds = %146, %124
%154 = bitcast i32* %22 to i8*
call void @llvm.lifetime.end(i64 4, i8* %154) #9
br label %155
; <label>:155: ; preds = %153
store i32 5, i32* %14, align 4
br label %185
; <label>:156: ; preds = %108
%157 = bitcast i32* %24 to i8*
call void @llvm.lifetime.start(i64 4, i8* %157) #9
store i32 0, i32* %24, align 4
br label %158
; <label>:158: ; preds = %180, %156
%159 = load i32, i32* %24, align 4
%160 = icmp slt i32 %159, 16
br i1 %160, label %163, label %161
; <label>:161: ; preds = %158
store i32 11, i32* %14, align 4
%162 = bitcast i32* %24 to i8*
call void @llvm.lifetime.end(i64 4, i8* %162) #9
br label %183
; <label>:163: ; preds = %158
%164 = bitcast i32* %25 to i8*
call void @llvm.lifetime.start(i64 4, i8* %164) #9
%165 = load i32, i32* %17, align 4
%166 = load i32, i32* %20, align 4
%167 = load i32, i32* %24, align 4
%168 = add nsw i32 %166, %167
%169 = mul nsw i32 256, %168
%170 = add nsw i32 %165, %169
store i32 %170, i32* %25, align 4
%171 = load float, float* %19, align 4
%172 = load i32, i32* %18, align 4
%173 = load i32, i32* %7, align 4
%174 = mul nsw i32 %172, %173
%175 = load i32, i32* %25, align 4
%176 = add nsw i32 %174, %175
%177 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %176) #10
%178 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %171, float %177) #10
store float %178, float* %19, align 4
%179 = bitcast i32* %25 to i8*
call void @llvm.lifetime.end(i64 4, i8* %179) #9
br label %180
; <label>:180: ; preds = %163
%181 = load i32, i32* %24, align 4
%182 = add nsw i32 %181, 1
store i32 %182, i32* %24, align 4
br label %158, !llvm.loop !85
; <label>:183: ; preds = %161
br label %184
; <label>:184: ; preds = %183
store i32 0, i32* %14, align 4
br label %185
; <label>:185: ; preds = %184, %155
%186 = bitcast i32* %21 to i8*
call void @llvm.lifetime.end(i64 4, i8* %186) #9
%187 = load i32, i32* %14, align 4
switch i32 %187, label %192 [
i32 0, label %188
]
; <label>:188: ; preds = %185
br label %189
; <label>:189: ; preds = %188
%190 = load i32, i32* %20, align 4
%191 = add nsw i32 %190, 16
store i32 %191, i32* %20, align 4
br label %104, !llvm.loop !86
; <label>:192: ; preds = %185, %107
%193 = bitcast i32* %20 to i8*
call void @llvm.lifetime.end(i64 4, i8* %193) #9
br label %194
; <label>:194: ; preds = %192
br label %195
; <label>:195: ; preds = %194, %80
%196 = bitcast i32* %26 to i8*
call void @llvm.lifetime.start(i64 4, i8* %196) #9
store i32 16, i32* %26, align 4
br label %197
; <label>:197: ; preds = %208, %195
%198 = load i32, i32* %26, align 4
%199 = icmp sgt i32 %198, 0
br i1 %199, label %202, label %200
; <label>:200: ; preds = %197
store i32 14, i32* %14, align 4
%201 = bitcast i32* %26 to i8*
call void @llvm.lifetime.end(i64 4, i8* %201) #9
br label %211
; <label>:202: ; preds = %197
%203 = load float, float* %19, align 4
%204 = load i32, i32* %26, align 4
%205 = call float @_ZL11__shfl_downfji(float %203, i32 %204, i32 32) #10
%206 = load float, float* %19, align 4
%207 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %205, float %206) #10
store float %207, float* %19, align 4
br label %208
; <label>:208: ; preds = %202
%209 = load i32, i32* %26, align 4
%210 = sdiv i32 %209, 2
store i32 %210, i32* %26, align 4
br label %197, !llvm.loop !87
; <label>:211: ; preds = %200
%212 = bitcast i32* %27 to i8*
call void @llvm.lifetime.start(i64 4, i8* %212) #9
%213 = load i32, i32* %12, align 4
%214 = and i32 %213, 31
store i32 %214, i32* %27, align 4
%215 = load i32, i32* %27, align 4
%216 = icmp eq i32 %215, 0
br i1 %216, label %217, label %225
; <label>:217: ; preds = %211
%218 = load i32, i32* %18, align 4
%219 = load i32, i32* %6, align 4
%220 = icmp slt i32 %218, %219
br i1 %220, label %221, label %225
; <label>:221: ; preds = %217
%222 = load i32, i32* %18, align 4
%223 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %222) #10
%224 = load float, float* %19, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %223, float %224) #10
br label %225
; <label>:225: ; preds = %221, %217, %211
%226 = bitcast i32* %27 to i8*
call void @llvm.lifetime.end(i64 4, i8* %226) #9
%227 = bitcast float* %19 to i8*
call void @llvm.lifetime.end(i64 4, i8* %227) #9
%228 = bitcast i32* %18 to i8*
call void @llvm.lifetime.end(i64 4, i8* %228) #9
%229 = bitcast i32* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %229) #9
%230 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %230) #9
%231 = bitcast i32* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %231) #9
br label %232
; <label>:232: ; preds = %225
%233 = load i32, i32* %13, align 4
%234 = add nsw i32 %233, 32
store i32 %234, i32* %13, align 4
br label %74
; <label>:235: ; preds = %78
%236 = bitcast i32* %12 to i8*
call void @llvm.lifetime.end(i64 4, i8* %236) #9
%237 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %237) #9
%238 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %238) #9
%239 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %239) #9
%240 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %240) #9
ret void
}
; Function Attrs: convergent nounwind
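; __global__ column-reduction kernel; demangled:
;   Eigen::internal::(anonymous namespace)::ColumnReduceKernel<128, 256, 16,
;     input-evaluator, output-evaluator, CudaSumReducer>(reducer, input, num_rows,
;     num_cols, output).
; After asserting blockDim == (256,1,1) and gridDim == (128,1,1), each thread owns
; one (column, 16-row strip) pair: for k = 0..15 it folds
; input[(row + k) * num_cols + col] through the reducer, substituting bottom_value()
; for out-of-range elements, then atomically reduces the strip total into
; output[col].  The outer loop advances by 128 * 256 = 32768, one stride of the
; whole grid.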
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
br label %64
; <label>:64: ; preds = %135, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %138
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%87 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %86) #10
store float %87, float* %15, align 4
%88 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %88) #9
store i32 0, i32* %16, align 4
br label %89
; <label>:89: ; preds = %124, %70
%90 = load i32, i32* %16, align 4
%91 = icmp slt i32 %90, 16
br i1 %91, label %94, label %92
; <label>:92: ; preds = %89
store i32 5, i32* %12, align 4
%93 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %93) #9
br label %127
; <label>:94: ; preds = %89
%95 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %95) #9
%96 = load i32, i32* %13, align 4
%97 = load i32, i32* %7, align 4
%98 = icmp slt i32 %96, %97
br i1 %98, label %99, label %114
; <label>:99: ; preds = %94
%100 = load i32, i32* %14, align 4
%101 = load i32, i32* %16, align 4
%102 = add nsw i32 %100, %101
%103 = load i32, i32* %6, align 4
%104 = icmp slt i32 %102, %103
br i1 %104, label %105, label %114
; <label>:105: ; preds = %99
%106 = load i32, i32* %14, align 4
%107 = load i32, i32* %16, align 4
%108 = add nsw i32 %106, %107
%109 = load i32, i32* %7, align 4
%110 = mul nsw i32 %108, %109
%111 = load i32, i32* %13, align 4
%112 = add nsw i32 %110, %111
%113 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %112) #10
br label %117
; <label>:114: ; preds = %99, %94
%115 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%116 = call float @_ZN5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %115) #10
br label %117
; <label>:117: ; preds = %114, %105
%118 = phi float [ %113, %105 ], [ %116, %114 ]
store float %118, float* %17, align 4
%119 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%120 = load float, float* %15, align 4
%121 = load float, float* %17, align 4
%122 = call float @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEEclEff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %119, float %120, float %121) #10
store float %122, float* %15, align 4
%123 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %123) #9
br label %124
; <label>:124: ; preds = %117
%125 = load i32, i32* %16, align 4
%126 = add nsw i32 %125, 1
store i32 %126, i32* %16, align 4
br label %89
; <label>:127: ; preds = %92
%128 = bitcast %"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* %0 to %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"*
%129 = load i32, i32* %13, align 4
%130 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %129) #10
%131 = load float, float* %15, align 4
call void @_ZNK5Eigen8internal12_GLOBAL__N_112FnSumReducerINS1_8IdentityEE13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer"* %128, float* %130, float %131) #10
%132 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %132) #9
%133 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %133) #9
%134 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %134) #9
br label %135
; <label>:135: ; preds = %127
%136 = load i32, i32* %11, align 4
%137 = add nsw i32 %136, 32768
store i32 %137, i32* %11, align 4
br label %64
; <label>:138: ; preds = %68
%139 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %139) #9
%140 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %140) #9
%141 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %141) #9
ret void
}
; Function Attrs: convergent nounwind
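; CudaMaxReducer instantiation of the same ColumnReduceKernel<128, 256, 16, ...>;
; the structure matches the sum variant above.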
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%6 = alloca i32, align 4
%7 = alloca i32, align 4
%8 = alloca i32, align 4
%9 = alloca i32, align 4
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca float, align 4
%16 = alloca i32, align 4
%17 = alloca float, align 4
store i32 %2, i32* %6, align 4
store i32 %3, i32* %7, align 4
%18 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #10
%19 = icmp eq i32 %18, 256
br i1 %19, label %20, label %21
; <label>:20: ; preds = %5
br label %22
; <label>:21: ; preds = %5
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %22
; <label>:22: ; preds = %21, %20
%23 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #10
%24 = icmp eq i32 %23, 1
br i1 %24, label %25, label %26
; <label>:25: ; preds = %22
br label %27
; <label>:26: ; preds = %22
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %27
; <label>:27: ; preds = %26, %25
%28 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_zEv() #10
%29 = icmp eq i32 %28, 1
br i1 %29, label %30, label %31
; <label>:30: ; preds = %27
br label %32
; <label>:31: ; preds = %27
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %32
; <label>:32: ; preds = %31, %30
%33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #10
%34 = icmp eq i32 %33, 128
br i1 %34, label %35, label %36
; <label>:35: ; preds = %32
br label %37
; <label>:36: ; preds = %32
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %37
; <label>:37: ; preds = %36, %35
%38 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #10
%39 = icmp eq i32 %38, 1
br i1 %39, label %40, label %41
; <label>:40: ; preds = %37
br label %42
; <label>:41: ; preds = %37
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %42
; <label>:42: ; preds = %41, %40
%43 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_zEv() #10
%44 = icmp eq i32 %43, 1
br i1 %44, label %45, label %46
; <label>:45: ; preds = %42
br label %47
; <label>:46: ; preds = %42
call void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i32 0, i32 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i32 0, i32 0)) #10
br label %47
; <label>:47: ; preds = %46, %45
%48 = bitcast i32* %8 to i8*
call void @llvm.lifetime.start(i64 4, i8* %48) #9
%49 = load i32, i32* %6, align 4
%50 = add nsw i32 %49, 16
%51 = sub nsw i32 %50, 1
%52 = sdiv i32 %51, 16
%53 = load i32, i32* %7, align 4
%54 = mul nsw i32 %52, %53
store i32 %54, i32* %8, align 4
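; %54 = ((%2 + 15) / 16) * %3: the total number of work items, where each
; item covers up to NumPerThread = 16 consecutive rows of one output column.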
%55 = bitcast i32* %9 to i8*
call void @llvm.lifetime.start(i64 4, i8* %55) #9
%56 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #10
store i32 %56, i32* %9, align 4
%57 = bitcast i32* %10 to i8*
call void @llvm.lifetime.start(i64 4, i8* %57) #9
%58 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #10
store i32 %58, i32* %10, align 4
%59 = bitcast i32* %11 to i8*
call void @llvm.lifetime.start(i64 4, i8* %59) #9
%60 = load i32, i32* %9, align 4
%61 = mul nsw i32 %60, 256
%62 = load i32, i32* %10, align 4
%63 = add nsw i32 %61, %62
store i32 %63, i32* %11, align 4
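; %63 = blockIdx.x * 256 + threadIdx.x: the global thread index. The loop
; headed by block 64 is a grid-stride loop; each iteration advances by
; 32768 = 128 blocks * 256 threads (see the add at the loop latch).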
br label %64
; <label>:64: ; preds = %131, %47
%65 = load i32, i32* %11, align 4
%66 = load i32, i32* %8, align 4
%67 = icmp slt i32 %65, %66
br i1 %67, label %70, label %68
; <label>:68: ; preds = %64
store i32 2, i32* %12, align 4
%69 = bitcast i32* %11 to i8*
call void @llvm.lifetime.end(i64 4, i8* %69) #9
br label %134
; <label>:70: ; preds = %64
%71 = bitcast i32* %13 to i8*
call void @llvm.lifetime.start(i64 4, i8* %71) #9
%72 = load i32, i32* %11, align 4
%73 = load i32, i32* %7, align 4
%74 = srem i32 %72, %73
store i32 %74, i32* %13, align 4
%75 = bitcast i32* %14 to i8*
call void @llvm.lifetime.start(i64 4, i8* %75) #9
%76 = load i32, i32* %11, align 4
%77 = load i32, i32* %7, align 4
%78 = sdiv i32 %76, %77
%79 = load i32, i32* %6, align 4
%80 = add nsw i32 %79, 16
%81 = sub nsw i32 %80, 1
%82 = sdiv i32 %81, 16
%83 = srem i32 %78, %82
%84 = mul nsw i32 %83, 16
store i32 %84, i32* %14, align 4
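; Decompose the work-item index: %74 = i % %3 is the output column, and
; %84 = ((i / %3) % ceil(%2 / 16)) * 16 is the first input row this thread
; reduces.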
%85 = bitcast float* %15 to i8*
call void @llvm.lifetime.start(i64 4, i8* %85) #9
%86 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
store float %86, float* %15, align 4
%87 = bitcast i32* %16 to i8*
call void @llvm.lifetime.start(i64 4, i8* %87) #9
store i32 0, i32* %16, align 4
br label %88
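; Inner loop (block 88): fold up to 16 row elements into the accumulator
; %15, seeded with the reducer's bottom_value(). Out-of-range elements
; contribute bottom_value(), which is the identity for the reduction and
; leaves the accumulator unchanged.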
; <label>:88: ; preds = %121, %70
%89 = load i32, i32* %16, align 4
%90 = icmp slt i32 %89, 16
br i1 %90, label %93, label %91
; <label>:91: ; preds = %88
store i32 5, i32* %12, align 4
%92 = bitcast i32* %16 to i8*
call void @llvm.lifetime.end(i64 4, i8* %92) #9
br label %124
; <label>:93: ; preds = %88
%94 = bitcast float* %17 to i8*
call void @llvm.lifetime.start(i64 4, i8* %94) #9
%95 = load i32, i32* %13, align 4
%96 = load i32, i32* %7, align 4
%97 = icmp slt i32 %95, %96
br i1 %97, label %98, label %113
; <label>:98: ; preds = %93
%99 = load i32, i32* %14, align 4
%100 = load i32, i32* %16, align 4
%101 = add nsw i32 %99, %100
%102 = load i32, i32* %6, align 4
%103 = icmp slt i32 %101, %102
br i1 %103, label %104, label %113
; <label>:104: ; preds = %98
%105 = load i32, i32* %14, align 4
%106 = load i32, i32* %16, align 4
%107 = add nsw i32 %105, %106
%108 = load i32, i32* %7, align 4
%109 = mul nsw i32 %107, %108
%110 = load i32, i32* %13, align 4
%111 = add nsw i32 %109, %110
%112 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.13"* %1, i32 %111) #10
br label %115
; <label>:113: ; preds = %98, %93
%114 = call float @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer12bottom_valueEv(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0) #10
br label %115
; <label>:115: ; preds = %113, %104
%116 = phi float [ %112, %104 ], [ %114, %113 ]
store float %116, float* %17, align 4
%117 = load float, float* %15, align 4
%118 = load float, float* %17, align 4
%119 = call float @_ZNK5Eigen8internal12_GLOBAL__N_114CudaMaxReducerclEff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float %117, float %118) #10
store float %119, float* %15, align 4
%120 = bitcast float* %17 to i8*
call void @llvm.lifetime.end(i64 4, i8* %120) #9
br label %121
; <label>:121: ; preds = %115
%122 = load i32, i32* %16, align 4
%123 = add nsw i32 %122, 1
store i32 %123, i32* %16, align 4
br label %88
; <label>:124: ; preds = %91
%125 = load i32, i32* %13, align 4
%126 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.16"* %4, i32 %125) #10
%127 = load float, float* %15, align 4
call void @_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, float* %126, float %127) #10
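; Publish this thread's partial maximum into the per-column output slot;
; atomic_reduce performs the cross-thread combine.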
%128 = bitcast float* %15 to i8*
call void @llvm.lifetime.end(i64 4, i8* %128) #9
%129 = bitcast i32* %14 to i8*
call void @llvm.lifetime.end(i64 4, i8* %129) #9
%130 = bitcast i32* %13 to i8*
call void @llvm.lifetime.end(i64 4, i8* %130) #9
br label %131
; <label>:131: ; preds = %124
%132 = load i32, i32* %11, align 4
%133 = add nsw i32 %132, 32768
store i32 %133, i32* %11, align 4
br label %64
; <label>:134: ; preds = %68
%135 = bitcast i32* %10 to i8*
call void @llvm.lifetime.end(i64 4, i8* %135) #9
%136 = bitcast i32* %9 to i8*
call void @llvm.lifetime.end(i64 4, i8* %136) #9
%137 = bitcast i32* %8 to i8*
call void @llvm.lifetime.end(i64 4, i8* %137) #9
ret void
}
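; For orientation, a minimal CUDA-level sketch of the kernel above,
; reconstructed from the IR. Names are illustrative, not Eigen's actual
; source; divup and the reducer interface are assumptions.
;
;   template <int NumBlocks /*128*/, int NumThreads /*256*/, int NumPerThread /*16*/,
;             typename Input, typename Output, typename Reducer>
;   __global__ void ColumnReduceKernel(Reducer r, Input in, int rows, int cols, Output out) {
;     const int num_items = divup(rows, NumPerThread) * cols;
;     // Grid-stride loop over (row-block, column) work items.
;     for (int i = blockIdx.x * NumThreads + threadIdx.x; i < num_items;
;          i += NumBlocks * NumThreads) {  // stride = 32768
;       const int col = i % cols;
;       const int row = ((i / cols) % divup(rows, NumPerThread)) * NumPerThread;
;       float acc = r.bottom_value();
;       for (int j = 0; j < NumPerThread; ++j) {
;         const bool in_range = (col < cols) && (row + j < rows);
;         acc = r(acc, in_range ? in.coeff((row + j) * cols + col) : r.bottom_value());
;       }
;       r.atomic_reduce(&out.coeffRef(col), acc);
;     }
;   }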
; Function Attrs: alwaysinline inlinehint
define internal i32 @__nv_umulhi(i32, i32) #8 {
%3 = call i32 @llvm.nvvm.mulhi.ui(i32 %0, i32 %1)
ret i32 %3
}
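; __nv_umulhi is the libdevice wrapper for llvm.nvvm.mulhi.ui: it returns
; the high 32 bits of the full 64-bit product of two unsigned 32-bit
; operands (PTX mul.hi.u32).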
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.mulhi.ui(i32, i32) #3
; Function Attrs: alwaysinline inlinehint
define internal float @__nv_fmaxf(float, float) #8 {
%3 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%4 = icmp ne i32 %3, 0
br i1 %4, label %5, label %7
; <label>:5: ; preds = %2
%6 = call float @llvm.nvvm.fmax.ftz.f(float %0, float %1)
br label %9
; <label>:7: ; preds = %2
%8 = call float @llvm.nvvm.fmax.f(float %0, float %1)
br label %9
; <label>:9: ; preds = %7, %5
%10 = phi float [ %6, %5 ], [ %8, %7 ]
ret float %10
}
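; __nv_fmaxf selects between the flush-to-zero and IEEE fmax intrinsics at
; libdevice-link time via __nvvm_reflect; the 11-byte string it reflects on
; is presumably "__CUDA_FTZ" (cf. the nvvm-reflect-ftz module flag below).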
declare i32 @__nvvm_reflect(i8*)
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.fmax.ftz.f(float, float) #3
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.fmax.f(float, float) #3
attributes #0 = { convergent nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { alwaysinline convergent inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind readnone }
attributes #4 = { convergent inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { convergent nounwind }
attributes #6 = { argmemonly nounwind readonly }
attributes #7 = { convergent noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { alwaysinline inlinehint }
attributes #9 = { nounwind }
attributes #10 = { convergent }
attributes #11 = { convergent noreturn }
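; NVVM annotations: the !"kernel" entries mark the __global__ entry points
; visible to the CUDA driver, and the !"maxntidx" entries cap the launch at
; 1024 threads per block for the EigenMetaKernel instantiations.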
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !38, !40, !40, !40, !40, !41, !41, !40}
!llvm.module.flags = !{!42, !43}
!llvm.ident = !{!44}
!nvvm.internalize.after.link = !{}
!nvvmir.version = !{!45}
!0 = !{void (float, i32, float*)* @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_, !"kernel", i32 1}
!1 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!2 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!3 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!4 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!5 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!6 = !{void (float, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_, !"kernel", i32 1}
!7 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!8 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!9 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!10 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!11 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!12 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!13 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!14 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!15 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1}
!16 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!17 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!18 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!19 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!20 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!21 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!22 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!23 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!24 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!25 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!26 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!27 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!28 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!29 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!30 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!31 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!32 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!33 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1}
!34 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!35 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!36 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!37 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!38 = !{null, !"align", i32 8}
!39 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!40 = !{null, !"align", i32 16}
!41 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!42 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!43 = !{i32 1, !"PIC Level", i32 2}
!44 = !{!"clang version google3-trunk (trunk r271374)"}
!45 = !{i32 1, i32 2}
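; !46 onward are llvm.loop hints attached to loop latches earlier in the
; module: !47 requests an unroll count of 8, !49 enables unrolling, and !55
; disables it.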
!46 = distinct !{!46, !47}
!47 = !{!"llvm.loop.unroll.count", i32 8}
!48 = distinct !{!48, !49}
!49 = !{!"llvm.loop.unroll.enable"}
!50 = !{i32 457534}
!51 = distinct !{!51, !49}
!52 = distinct !{!52, !49}
!53 = distinct !{!53, !49}
!54 = distinct !{!54, !55}
!55 = !{!"llvm.loop.unroll.disable"}
!56 = distinct !{!56, !49}
!57 = distinct !{!57, !49}
!58 = distinct !{!58, !49}
!59 = distinct !{!59, !55}
!60 = distinct !{!60, !49}
!61 = distinct !{!61, !49}
!62 = distinct !{!62, !49}
!63 = distinct !{!63, !55}
!64 = distinct !{!64, !49}
!65 = distinct !{!65, !49}
!66 = distinct !{!66, !49}
!67 = distinct !{!67, !55}
!68 = distinct !{!68, !49}
!69 = distinct !{!69, !47}
!70 = distinct !{!70, !49}
!71 = distinct !{!71, !49}
!72 = distinct !{!72, !49}
!73 = distinct !{!73, !49}
!74 = distinct !{!74, !55}
!75 = distinct !{!75, !49}
!76 = distinct !{!76, !49}
!77 = distinct !{!77, !49}
!78 = distinct !{!78, !55}
!79 = distinct !{!79, !49}
!80 = distinct !{!80, !49}
!81 = distinct !{!81, !49}
!82 = distinct !{!82, !55}
!83 = distinct !{!83, !49}
!84 = distinct !{!84, !49}
!85 = distinct !{!85, !49}
!86 = distinct !{!86, !55}
!87 = distinct !{!87, !49}